drm/i915/gt: Use the local HWSP offset during submission
[linux/fpc-iii.git] drivers/gpu/drm/i915/gt/intel_lrc.c (blob 9dfa9a95a4d7386e8bc93ef8a4be665318abe76a)
1 /*
2 * Copyright © 2014 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
23 * Authors:
24 * Ben Widawsky <ben@bwidawsk.net>
25 * Michel Thierry <michel.thierry@intel.com>
26 * Thomas Daniel <thomas.daniel@intel.com>
27 * Oscar Mateo <oscar.mateo@intel.com>
31 /**
32 * DOC: Logical Rings, Logical Ring Contexts and Execlists
34 * Motivation:
35 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36 * These expanded contexts enable a number of new abilities, especially
37 * "Execlists" (also implemented in this file).
39 * One of the main differences with the legacy HW contexts is that logical
40 * ring contexts incorporate many more things into the context's state, like
41 * PDPs or ringbuffer control registers:
43 * The reason why PDPs are included in the context is straightforward: as
44 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45 * contained there means you don't need to do a ppgtt->switch_mm yourself;
46 * instead, the GPU will do it for you on the context switch.
48 * But, what about the ringbuffer control registers (head, tail, etc.)?
49 * Shouldn't we just need a set of those per engine command streamer? This is
50 * where the name "Logical Rings" starts to make sense: by virtualizing the
51 * rings, the engine cs shifts to a new "ring buffer" with every context
52 * switch. When you want to submit a workload to the GPU you: A) choose your
53 * context, B) find its appropriate virtualized ring, C) write commands to it
54 * and then, finally, D) tell the GPU to switch to that context.
56 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57 * to a context is via a context execution list, ergo "Execlists".
59 * LRC implementation:
60 * Regarding the creation of contexts, we have:
62 * - One global default context.
63 * - One local default context for each opened fd.
64 * - One local extra context for each context create ioctl call.
66 * Now that ringbuffers belong per-context (and not per-engine, like before)
67 * and that contexts are uniquely tied to a given engine (and not reusable,
68 * like before) we need:
70 * - One ringbuffer per-engine inside each context.
71 * - One backing object per-engine inside each context.
73 * The global default context starts its life with these new objects fully
74 * allocated and populated. The local default context for each opened fd is
75 * more complex, because we don't know at creation time which engine is going
76 * to use them. To handle this, we have implemented a deferred creation of LR
77 * contexts:
79 * The local context starts its life as a hollow or blank holder, that only
80 * gets populated for a given engine once we receive an execbuffer. If later
81 * on we receive another execbuffer ioctl for the same context but a different
82 * engine, we allocate/populate a new ringbuffer and context backing object and
83 * so on.
85 * Finally, regarding local contexts created using the ioctl call: as they are
86 * only allowed with the render ring, we can allocate & populate them right
87 * away (no need to defer anything, at least for now).
89 * Execlists implementation:
90 * Execlists are the new method by which, on gen8+ hardware, workloads are
91 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92 * This method works as follows:
94 * When a request is committed, its commands (the BB start and any leading or
95 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96 * for the appropriate context. The tail pointer in the hardware context is not
97 * updated at this time, but instead, kept by the driver in the ringbuffer
98 * structure. A structure representing this request is added to a request queue
99 * for the appropriate engine: this structure contains a copy of the context's
100 * tail after the request was written to the ring buffer and a pointer to the
101 * context itself.
103 * If the engine's request queue was empty before the request was added, the
104 * queue is processed immediately. Otherwise the queue will be processed during
105 * a context switch interrupt. In any case, elements on the queue will get sent
106 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107 * globally unique 20-bit submission ID.
109 * When execution of a request completes, the GPU updates the context status
110 * buffer with a context complete event and generates a context switch interrupt.
111 * During the interrupt handling, the driver examines the events in the buffer:
112 * for each context complete event, if the announced ID matches that on the head
113 * of the request queue, then that request is retired and removed from the queue.
115 * After processing, if any requests were retired and the queue is not empty
116 * then a new execution list can be submitted. The two requests at the front of
117 * the queue are next to be submitted but since a context may not occur twice in
118 * an execution list, if subsequent requests have the same ID as the first then
119 * the two requests must be combined. This is done simply by discarding requests
120 * at the head of the queue until either only one request is left (in which case
121 * we use a NULL second context) or the first two requests have unique IDs.
123 * By always executing the first two requests in the queue the driver ensures
124 * that the GPU is kept as busy as possible. In the case where a single context
125 * completes but a second context is still executing, the request for this second
126 * context will be at the head of the queue when we remove the first one. This
127 * request will then be resubmitted along with a new request for a different context,
128 * which will cause the hardware to continue executing the second request and queue
129 * the new request (the GPU detects the condition of a context getting preempted
130 * with the same context and optimizes the context switch flow by not doing
131 * preemption, but just sampling the new tail pointer).
134 #include <linux/interrupt.h>
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 #include "shmem_utils.h"
152 #define RING_EXECLIST_QFULL (1 << 0x2)
153 #define RING_EXECLIST1_VALID (1 << 0x3)
154 #define RING_EXECLIST0_VALID (1 << 0x4)
155 #define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE)
156 #define RING_EXECLIST1_ACTIVE (1 << 0x11)
157 #define RING_EXECLIST0_ACTIVE (1 << 0x12)
159 #define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0)
160 #define GEN8_CTX_STATUS_PREEMPTED (1 << 1)
161 #define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2)
162 #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3)
163 #define GEN8_CTX_STATUS_COMPLETE (1 << 4)
164 #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
166 #define GEN8_CTX_STATUS_COMPLETED_MASK \
167 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
169 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
171 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */
172 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
173 #define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15)
174 #define GEN12_IDLE_CTX_ID 0x7FF
175 #define GEN12_CSB_CTX_VALID(csb_dw) \
176 (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
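/*
 * Worked example (annotation, derived from the macros above): the SW context
 * ID field of a CSB dword spans bits 25:15. If that field decodes to
 * GEN12_IDLE_CTX_ID (0x7FF, i.e. all ones), no context occupies the slot and
 * GEN12_CSB_CTX_VALID() evaluates to false; any other value marks the entry
 * as describing a real context.
 */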
178 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
179 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
181 struct virtual_engine {
182 struct intel_engine_cs base;
183 struct intel_context context;
186 * We allow only a single request through the virtual engine at a time
187 * (each request in the timeline waits for the completion fence of
188 * the previous before being submitted). By restricting ourselves to
189 * only submitting a single request, each request is placed on to a
190 * physical engine to maximise load spreading (by virtue of the late greedy
191 * scheduling -- each real engine takes the next available request
192 * upon idling).
194 struct i915_request *request;
197 * We keep an rbtree of available virtual engines inside each physical
198 * engine, sorted by priority. Here we preallocate the nodes we need
199 * for the virtual engine, indexed by physical_engine->id.
201 struct ve_node {
202 struct rb_node rb;
203 int prio;
204 } nodes[I915_NUM_ENGINES];
207 * Keep track of bonded pairs -- restrictions upon our selection
208 * of physical engines any particular request may be submitted to.
209 * If we receive a submit-fence from a master engine, we will only
210 * use one of the sibling_mask physical engines.
212 struct ve_bond {
213 const struct intel_engine_cs *master;
214 intel_engine_mask_t sibling_mask;
215 } *bonds;
216 unsigned int num_bonds;
218 /* And finally, which physical engines this virtual engine maps onto. */
219 unsigned int num_siblings;
220 struct intel_engine_cs *siblings[];
223 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 GEM_BUG_ON(!intel_engine_is_virtual(engine));
226 return container_of(engine, struct virtual_engine, base);
229 static int __execlists_context_alloc(struct intel_context *ce,
230 struct intel_engine_cs *engine);
232 static void execlists_init_reg_state(u32 *reg_state,
233 const struct intel_context *ce,
234 const struct intel_engine_cs *engine,
235 const struct intel_ring *ring,
236 bool close);
237 static void
238 __execlists_update_reg_state(const struct intel_context *ce,
239 const struct intel_engine_cs *engine,
240 u32 head);
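/*
 * Annotation (not in the original file): the lrc_ring_*() helpers below
 * return the dword index of a register's MI_LOAD_REGISTER_IMM slot within
 * the context image, so the register offset lives at regs[index] and its
 * value at regs[index + 1] (see execlists_check_context() poking
 * regs[x + 1] for RING_MI_MODE). A return value of -1 means the register
 * has no slot in the context layout for that platform.
 */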
242 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
244 if (INTEL_GEN(engine->i915) >= 12)
245 return 0x60;
246 else if (INTEL_GEN(engine->i915) >= 9)
247 return 0x54;
248 else if (engine->class == RENDER_CLASS)
249 return 0x58;
250 else
251 return -1;
254 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
256 if (INTEL_GEN(engine->i915) >= 12)
257 return 0x74;
258 else if (INTEL_GEN(engine->i915) >= 9)
259 return 0x68;
260 else if (engine->class == RENDER_CLASS)
261 return 0xd8;
262 else
263 return -1;
266 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
268 if (INTEL_GEN(engine->i915) >= 12)
269 return 0x12;
270 else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
271 return 0x18;
272 else
273 return -1;
276 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
278 int x;
280 x = lrc_ring_wa_bb_per_ctx(engine);
281 if (x < 0)
282 return x;
284 return x + 2;
287 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
289 int x;
291 x = lrc_ring_indirect_ptr(engine);
292 if (x < 0)
293 return x;
295 return x + 2;
298 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
300 if (engine->class != RENDER_CLASS)
301 return -1;
303 if (INTEL_GEN(engine->i915) >= 12)
304 return 0xb6;
305 else if (INTEL_GEN(engine->i915) >= 11)
306 return 0xaa;
307 else
308 return -1;
311 static u32
312 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
314 switch (INTEL_GEN(engine->i915)) {
315 default:
316 MISSING_CASE(INTEL_GEN(engine->i915));
317 fallthrough;
318 case 12:
319 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
320 case 11:
321 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
322 case 10:
323 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
324 case 9:
325 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
326 case 8:
327 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
331 static void
332 lrc_ring_setup_indirect_ctx(u32 *regs,
333 const struct intel_engine_cs *engine,
334 u32 ctx_bb_ggtt_addr,
335 u32 size)
337 GEM_BUG_ON(!size);
338 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
339 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
340 regs[lrc_ring_indirect_ptr(engine) + 1] =
341 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
343 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
344 regs[lrc_ring_indirect_offset(engine) + 1] =
345 lrc_ring_indirect_offset_default(engine) << 6;
348 static u32 intel_context_get_runtime(const struct intel_context *ce)
351 * We can use either ppHWSP[16] which is recorded before the context
352 * switch (and so excludes the cost of context switches) or use the
353 * value from the context image itself, which is saved/restored earlier
354 * and so includes the cost of the save.
356 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
359 static void mark_eio(struct i915_request *rq)
361 if (i915_request_completed(rq))
362 return;
364 GEM_BUG_ON(i915_request_signaled(rq));
366 i915_request_set_error_once(rq, -EIO);
367 i915_request_mark_complete(rq);
370 static struct i915_request *
371 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
373 struct i915_request *active = rq;
375 rcu_read_lock();
376 list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
377 if (i915_request_completed(rq))
378 break;
380 active = rq;
382 rcu_read_unlock();
384 return active;
387 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
389 return (i915_ggtt_offset(engine->status_page.vma) +
390 I915_GEM_HWS_PREEMPT_ADDR);
393 static inline void
394 ring_set_paused(const struct intel_engine_cs *engine, int state)
397 * We inspect HWS_PREEMPT with a semaphore inside
398 * engine->emit_fini_breadcrumb. If the dword is true,
399 * the ring is paused as the semaphore will busywait
400 * until the dword is false.
402 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
403 if (state)
404 wmb();
407 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
409 return rb_entry(rb, struct i915_priolist, node);
412 static inline int rq_prio(const struct i915_request *rq)
414 return READ_ONCE(rq->sched.attr.priority);
417 static int effective_prio(const struct i915_request *rq)
419 int prio = rq_prio(rq);
422 * If this request is special and must not be interrupted at any
423 * cost, so be it. Note we are only checking the most recent request
424 * in the context and so may be masking an earlier vip request. It
425 * is hoped that under the conditions where nopreempt is used, this
426 * will not matter (i.e. all requests to that context will be
427 * nopreempt for as long as desired).
429 if (i915_request_has_nopreempt(rq))
430 prio = I915_PRIORITY_UNPREEMPTABLE;
432 return prio;
435 static int queue_prio(const struct intel_engine_execlists *execlists)
437 struct i915_priolist *p;
438 struct rb_node *rb;
440 rb = rb_first_cached(&execlists->queue);
441 if (!rb)
442 return INT_MIN;
445 * As the priolist[] are inverted, with the highest priority in [0],
446 * we have to flip the index value to become priority.
448 p = to_priolist(rb);
449 if (!I915_USER_PRIORITY_SHIFT)
450 return p->priority;
452 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
455 static inline bool need_preempt(const struct intel_engine_cs *engine,
456 const struct i915_request *rq,
457 struct rb_node *rb)
459 int last_prio;
461 if (!intel_engine_has_semaphores(engine))
462 return false;
465 * Check if the current priority hint merits a preemption attempt.
467 * We record the highest value priority we saw during rescheduling
468 * prior to this dequeue, therefore we know that if it is strictly
469 * less than the current tail of ELSP[0], we do not need to force
470 * a preempt-to-idle cycle.
472 * However, the priority hint is a mere hint that we may need to
473 * preempt. If that hint is stale or we may be trying to preempt
474 * ourselves, ignore the request.
476 * More naturally we would write
477 * prio >= max(0, last);
478 * except that we wish to prevent triggering preemption at the same
479 * priority level: the task that is running should remain running
480 * to preserve FIFO ordering of dependencies.
482 last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
483 if (engine->execlists.queue_priority_hint <= last_prio)
484 return false;
487 * Check against the first request in ELSP[1]; it will, thanks to the
488 * power of PI, be the highest priority of that context.
490 if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
491 rq_prio(list_next_entry(rq, sched.link)) > last_prio)
492 return true;
494 if (rb) {
495 struct virtual_engine *ve =
496 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
497 bool preempt = false;
499 if (engine == ve->siblings[0]) { /* only preempt one sibling */
500 struct i915_request *next;
502 rcu_read_lock();
503 next = READ_ONCE(ve->request);
504 if (next)
505 preempt = rq_prio(next) > last_prio;
506 rcu_read_unlock();
509 if (preempt)
510 return preempt;
514 * If the inflight context did not trigger the preemption, then maybe
515 * it was the set of queued requests? Pick the highest priority in
516 * the queue (the first active priolist) and see if it deserves to be
517 * running instead of ELSP[0].
519 * The highest priority request in the queue can not be either
520 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
521 * context, its priority would not exceed ELSP[0] aka last_prio.
523 return queue_prio(&engine->execlists) > last_prio;
526 __maybe_unused static inline bool
527 assert_priority_queue(const struct i915_request *prev,
528 const struct i915_request *next)
531 * Without preemption, the prev may refer to the still active element
532 * which we refuse to let go.
534 * Even with preemption, there are times when we think it is better not
535 * to preempt and leave an ostensibly lower priority request in flight.
537 if (i915_request_is_active(prev))
538 return true;
540 return rq_prio(prev) >= rq_prio(next);
544 * The context descriptor encodes various attributes of a context,
545 * including its GTT address and some flags. Because it's fairly
546 * expensive to calculate, we'll just do it once and cache the result,
547 * which remains valid until the context is unpinned.
549 * This is what a descriptor looks like, from LSB to MSB::
551 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
552 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
553 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
554 * bits 53-54: mbz, reserved for use by hardware
555 * bits 55-63: group ID, currently unused and set to 0
557 * Starting from Gen11, the upper dword of the descriptor has a new format:
559 * bits 32-36: reserved
560 * bits 37-47: SW context ID
561 * bits 48-53: engine instance
562 * bit 54: mbz, reserved for use by hardware
563 * bits 55-60: SW counter
564 * bits 61-63: engine class
566 * engine info, SW context ID and SW counter need to form a unique number
567 * (Context ID) per lrc.
569 static u32
570 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
572 u32 desc;
574 desc = INTEL_LEGACY_32B_CONTEXT;
575 if (i915_vm_is_4lvl(ce->vm))
576 desc = INTEL_LEGACY_64B_CONTEXT;
577 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
579 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
580 if (IS_GEN(engine->i915, 8))
581 desc |= GEN8_CTX_L3LLC_COHERENT;
583 return i915_ggtt_offset(ce->state) | desc;
586 static inline unsigned int dword_in_page(void *addr)
588 return offset_in_page(addr) / sizeof(u32);
591 static void set_offsets(u32 *regs,
592 const u8 *data,
593 const struct intel_engine_cs *engine,
594 bool clear)
595 #define NOP(x) (BIT(7) | (x))
596 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
597 #define POSTED BIT(0)
598 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
599 #define REG16(x) \
600 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
601 (((x) >> 2) & 0x7f)
602 #define END(total_state_size) 0, (total_state_size)
604 const u32 base = engine->mmio_base;
606 while (*data) {
607 u8 count, flags;
609 if (*data & BIT(7)) { /* skip */
610 count = *data++ & ~BIT(7);
611 if (clear)
612 memset32(regs, MI_NOOP, count);
613 regs += count;
614 continue;
617 count = *data & 0x3f;
618 flags = *data >> 6;
619 data++;
621 *regs = MI_LOAD_REGISTER_IMM(count);
622 if (flags & POSTED)
623 *regs |= MI_LRI_FORCE_POSTED;
624 if (INTEL_GEN(engine->i915) >= 11)
625 *regs |= MI_LRI_LRM_CS_MMIO;
626 regs++;
628 GEM_BUG_ON(!count);
629 do {
630 u32 offset = 0;
631 u8 v;
633 do {
634 v = *data++;
635 offset <<= 7;
636 offset |= v & ~BIT(7);
637 } while (v & BIT(7));
639 regs[0] = base + (offset << 2);
640 if (clear)
641 regs[1] = 0;
642 regs += 2;
643 } while (--count);
646 if (clear) {
647 u8 count = *++data;
649 /* Clear past the tail for HW access */
650 GEM_BUG_ON(dword_in_page(regs) > count);
651 memset32(regs, MI_NOOP, count - dword_in_page(regs));
653 /* Close the batch; used mainly by live_lrc_layout() */
654 *regs = MI_BATCH_BUFFER_END;
655 if (INTEL_GEN(engine->i915) >= 10)
656 *regs |= BIT(0);
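/*
 * Annotation: how to read the offset tables that follow. Each table is a
 * byte-packed script consumed by set_offsets() above: NOP(x) skips x dwords
 * in the context image, LRI(count, flags) emits an MI_LOAD_REGISTER_IMM
 * header, REG()/REG16() encode a register offset relative to the engine's
 * mmio_base in 7-bit chunks (high bit set = "more chunks follow"), and
 * END(total) terminates the script with the register-state size in dwords.
 * As a worked example, REG16(0x244) packs to the bytes 0x81, 0x11; the
 * decode loop rebuilds 0x91 from the two 7-bit chunks and writes
 * regs[0] = mmio_base + (0x91 << 2) = mmio_base + 0x244.
 */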
660 static const u8 gen8_xcs_offsets[] = {
661 NOP(1),
662 LRI(11, 0),
663 REG16(0x244),
664 REG(0x034),
665 REG(0x030),
666 REG(0x038),
667 REG(0x03c),
668 REG(0x168),
669 REG(0x140),
670 REG(0x110),
671 REG(0x11c),
672 REG(0x114),
673 REG(0x118),
675 NOP(9),
676 LRI(9, 0),
677 REG16(0x3a8),
678 REG16(0x28c),
679 REG16(0x288),
680 REG16(0x284),
681 REG16(0x280),
682 REG16(0x27c),
683 REG16(0x278),
684 REG16(0x274),
685 REG16(0x270),
687 NOP(13),
688 LRI(2, 0),
689 REG16(0x200),
690 REG(0x028),
692 END(80)
695 static const u8 gen9_xcs_offsets[] = {
696 NOP(1),
697 LRI(14, POSTED),
698 REG16(0x244),
699 REG(0x034),
700 REG(0x030),
701 REG(0x038),
702 REG(0x03c),
703 REG(0x168),
704 REG(0x140),
705 REG(0x110),
706 REG(0x11c),
707 REG(0x114),
708 REG(0x118),
709 REG(0x1c0),
710 REG(0x1c4),
711 REG(0x1c8),
713 NOP(3),
714 LRI(9, POSTED),
715 REG16(0x3a8),
716 REG16(0x28c),
717 REG16(0x288),
718 REG16(0x284),
719 REG16(0x280),
720 REG16(0x27c),
721 REG16(0x278),
722 REG16(0x274),
723 REG16(0x270),
725 NOP(13),
726 LRI(1, POSTED),
727 REG16(0x200),
729 NOP(13),
730 LRI(44, POSTED),
731 REG(0x028),
732 REG(0x09c),
733 REG(0x0c0),
734 REG(0x178),
735 REG(0x17c),
736 REG16(0x358),
737 REG(0x170),
738 REG(0x150),
739 REG(0x154),
740 REG(0x158),
741 REG16(0x41c),
742 REG16(0x600),
743 REG16(0x604),
744 REG16(0x608),
745 REG16(0x60c),
746 REG16(0x610),
747 REG16(0x614),
748 REG16(0x618),
749 REG16(0x61c),
750 REG16(0x620),
751 REG16(0x624),
752 REG16(0x628),
753 REG16(0x62c),
754 REG16(0x630),
755 REG16(0x634),
756 REG16(0x638),
757 REG16(0x63c),
758 REG16(0x640),
759 REG16(0x644),
760 REG16(0x648),
761 REG16(0x64c),
762 REG16(0x650),
763 REG16(0x654),
764 REG16(0x658),
765 REG16(0x65c),
766 REG16(0x660),
767 REG16(0x664),
768 REG16(0x668),
769 REG16(0x66c),
770 REG16(0x670),
771 REG16(0x674),
772 REG16(0x678),
773 REG16(0x67c),
774 REG(0x068),
776 END(176)
779 static const u8 gen12_xcs_offsets[] = {
780 NOP(1),
781 LRI(13, POSTED),
782 REG16(0x244),
783 REG(0x034),
784 REG(0x030),
785 REG(0x038),
786 REG(0x03c),
787 REG(0x168),
788 REG(0x140),
789 REG(0x110),
790 REG(0x1c0),
791 REG(0x1c4),
792 REG(0x1c8),
793 REG(0x180),
794 REG16(0x2b4),
796 NOP(5),
797 LRI(9, POSTED),
798 REG16(0x3a8),
799 REG16(0x28c),
800 REG16(0x288),
801 REG16(0x284),
802 REG16(0x280),
803 REG16(0x27c),
804 REG16(0x278),
805 REG16(0x274),
806 REG16(0x270),
808 END(80)
811 static const u8 gen8_rcs_offsets[] = {
812 NOP(1),
813 LRI(14, POSTED),
814 REG16(0x244),
815 REG(0x034),
816 REG(0x030),
817 REG(0x038),
818 REG(0x03c),
819 REG(0x168),
820 REG(0x140),
821 REG(0x110),
822 REG(0x11c),
823 REG(0x114),
824 REG(0x118),
825 REG(0x1c0),
826 REG(0x1c4),
827 REG(0x1c8),
829 NOP(3),
830 LRI(9, POSTED),
831 REG16(0x3a8),
832 REG16(0x28c),
833 REG16(0x288),
834 REG16(0x284),
835 REG16(0x280),
836 REG16(0x27c),
837 REG16(0x278),
838 REG16(0x274),
839 REG16(0x270),
841 NOP(13),
842 LRI(1, 0),
843 REG(0x0c8),
845 END(80)
848 static const u8 gen9_rcs_offsets[] = {
849 NOP(1),
850 LRI(14, POSTED),
851 REG16(0x244),
852 REG(0x34),
853 REG(0x30),
854 REG(0x38),
855 REG(0x3c),
856 REG(0x168),
857 REG(0x140),
858 REG(0x110),
859 REG(0x11c),
860 REG(0x114),
861 REG(0x118),
862 REG(0x1c0),
863 REG(0x1c4),
864 REG(0x1c8),
866 NOP(3),
867 LRI(9, POSTED),
868 REG16(0x3a8),
869 REG16(0x28c),
870 REG16(0x288),
871 REG16(0x284),
872 REG16(0x280),
873 REG16(0x27c),
874 REG16(0x278),
875 REG16(0x274),
876 REG16(0x270),
878 NOP(13),
879 LRI(1, 0),
880 REG(0xc8),
882 NOP(13),
883 LRI(44, POSTED),
884 REG(0x28),
885 REG(0x9c),
886 REG(0xc0),
887 REG(0x178),
888 REG(0x17c),
889 REG16(0x358),
890 REG(0x170),
891 REG(0x150),
892 REG(0x154),
893 REG(0x158),
894 REG16(0x41c),
895 REG16(0x600),
896 REG16(0x604),
897 REG16(0x608),
898 REG16(0x60c),
899 REG16(0x610),
900 REG16(0x614),
901 REG16(0x618),
902 REG16(0x61c),
903 REG16(0x620),
904 REG16(0x624),
905 REG16(0x628),
906 REG16(0x62c),
907 REG16(0x630),
908 REG16(0x634),
909 REG16(0x638),
910 REG16(0x63c),
911 REG16(0x640),
912 REG16(0x644),
913 REG16(0x648),
914 REG16(0x64c),
915 REG16(0x650),
916 REG16(0x654),
917 REG16(0x658),
918 REG16(0x65c),
919 REG16(0x660),
920 REG16(0x664),
921 REG16(0x668),
922 REG16(0x66c),
923 REG16(0x670),
924 REG16(0x674),
925 REG16(0x678),
926 REG16(0x67c),
927 REG(0x68),
929 END(176)
932 static const u8 gen11_rcs_offsets[] = {
933 NOP(1),
934 LRI(15, POSTED),
935 REG16(0x244),
936 REG(0x034),
937 REG(0x030),
938 REG(0x038),
939 REG(0x03c),
940 REG(0x168),
941 REG(0x140),
942 REG(0x110),
943 REG(0x11c),
944 REG(0x114),
945 REG(0x118),
946 REG(0x1c0),
947 REG(0x1c4),
948 REG(0x1c8),
949 REG(0x180),
951 NOP(1),
952 LRI(9, POSTED),
953 REG16(0x3a8),
954 REG16(0x28c),
955 REG16(0x288),
956 REG16(0x284),
957 REG16(0x280),
958 REG16(0x27c),
959 REG16(0x278),
960 REG16(0x274),
961 REG16(0x270),
963 LRI(1, POSTED),
964 REG(0x1b0),
966 NOP(10),
967 LRI(1, 0),
968 REG(0x0c8),
970 END(80)
973 static const u8 gen12_rcs_offsets[] = {
974 NOP(1),
975 LRI(13, POSTED),
976 REG16(0x244),
977 REG(0x034),
978 REG(0x030),
979 REG(0x038),
980 REG(0x03c),
981 REG(0x168),
982 REG(0x140),
983 REG(0x110),
984 REG(0x1c0),
985 REG(0x1c4),
986 REG(0x1c8),
987 REG(0x180),
988 REG16(0x2b4),
990 NOP(5),
991 LRI(9, POSTED),
992 REG16(0x3a8),
993 REG16(0x28c),
994 REG16(0x288),
995 REG16(0x284),
996 REG16(0x280),
997 REG16(0x27c),
998 REG16(0x278),
999 REG16(0x274),
1000 REG16(0x270),
1002 LRI(3, POSTED),
1003 REG(0x1b0),
1004 REG16(0x5a8),
1005 REG16(0x5ac),
1007 NOP(6),
1008 LRI(1, 0),
1009 REG(0x0c8),
1010 NOP(3 + 9 + 1),
1012 LRI(51, POSTED),
1013 REG16(0x588),
1014 REG16(0x588),
1015 REG16(0x588),
1016 REG16(0x588),
1017 REG16(0x588),
1018 REG16(0x588),
1019 REG(0x028),
1020 REG(0x09c),
1021 REG(0x0c0),
1022 REG(0x178),
1023 REG(0x17c),
1024 REG16(0x358),
1025 REG(0x170),
1026 REG(0x150),
1027 REG(0x154),
1028 REG(0x158),
1029 REG16(0x41c),
1030 REG16(0x600),
1031 REG16(0x604),
1032 REG16(0x608),
1033 REG16(0x60c),
1034 REG16(0x610),
1035 REG16(0x614),
1036 REG16(0x618),
1037 REG16(0x61c),
1038 REG16(0x620),
1039 REG16(0x624),
1040 REG16(0x628),
1041 REG16(0x62c),
1042 REG16(0x630),
1043 REG16(0x634),
1044 REG16(0x638),
1045 REG16(0x63c),
1046 REG16(0x640),
1047 REG16(0x644),
1048 REG16(0x648),
1049 REG16(0x64c),
1050 REG16(0x650),
1051 REG16(0x654),
1052 REG16(0x658),
1053 REG16(0x65c),
1054 REG16(0x660),
1055 REG16(0x664),
1056 REG16(0x668),
1057 REG16(0x66c),
1058 REG16(0x670),
1059 REG16(0x674),
1060 REG16(0x678),
1061 REG16(0x67c),
1062 REG(0x068),
1063 REG(0x084),
1064 NOP(1),
1066 END(192)
1069 #undef END
1070 #undef REG16
1071 #undef REG
1072 #undef LRI
1073 #undef NOP
1075 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1078 * The gen12+ lists only have the registers we program in the basic
1079 * default state. We rely on the context image using relative
1080 * addressing to automatically fix up the register state between the
1081 * physical engines for the virtual engine.
1083 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1084 !intel_engine_has_relative_mmio(engine));
1086 if (engine->class == RENDER_CLASS) {
1087 if (INTEL_GEN(engine->i915) >= 12)
1088 return gen12_rcs_offsets;
1089 else if (INTEL_GEN(engine->i915) >= 11)
1090 return gen11_rcs_offsets;
1091 else if (INTEL_GEN(engine->i915) >= 9)
1092 return gen9_rcs_offsets;
1093 else
1094 return gen8_rcs_offsets;
1095 } else {
1096 if (INTEL_GEN(engine->i915) >= 12)
1097 return gen12_xcs_offsets;
1098 else if (INTEL_GEN(engine->i915) >= 9)
1099 return gen9_xcs_offsets;
1100 else
1101 return gen8_xcs_offsets;
1105 static struct i915_request *
1106 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1108 struct i915_request *rq, *rn, *active = NULL;
1109 struct list_head *pl;
1110 int prio = I915_PRIORITY_INVALID;
1112 lockdep_assert_held(&engine->active.lock);
1114 list_for_each_entry_safe_reverse(rq, rn,
1115 &engine->active.requests,
1116 sched.link) {
1117 if (i915_request_completed(rq))
1118 continue; /* XXX */
1120 __i915_request_unsubmit(rq);
1123 * Push the request back into the queue for later resubmission.
1124 * If this request is not native to this physical engine (i.e.
1125 * it came from a virtual source), push it back onto the virtual
1126 * engine so that it can be moved across onto another physical
1127 * engine as load dictates.
1129 if (likely(rq->execution_mask == engine->mask)) {
1130 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1131 if (rq_prio(rq) != prio) {
1132 prio = rq_prio(rq);
1133 pl = i915_sched_lookup_priolist(engine, prio);
1135 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1137 list_move(&rq->sched.link, pl);
1138 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1140 /* Check in case we rollback so far we wrap [size/2] */
1141 if (intel_ring_direction(rq->ring,
1142 rq->tail,
1143 rq->ring->tail + 8) > 0)
1144 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1146 active = rq;
1147 } else {
1148 struct intel_engine_cs *owner = rq->context->engine;
1151 * Decouple the virtual breadcrumb before moving it
1152 * back to the virtual engine -- we don't want the
1153 * request to complete in the background and try
1154 * and cancel the breadcrumb on the virtual engine
1155 * (instead of the old engine where it is linked)!
1157 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1158 &rq->fence.flags)) {
1159 spin_lock_nested(&rq->lock,
1160 SINGLE_DEPTH_NESTING);
1161 i915_request_cancel_breadcrumb(rq);
1162 spin_unlock(&rq->lock);
1164 WRITE_ONCE(rq->engine, owner);
1165 owner->submit_request(rq);
1166 active = NULL;
1170 return active;
1173 struct i915_request *
1174 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1176 struct intel_engine_cs *engine =
1177 container_of(execlists, typeof(*engine), execlists);
1179 return __unwind_incomplete_requests(engine);
1182 static inline void
1183 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1186 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1187 * the compiler should eliminate this function as dead code.
1189 if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1190 return;
1192 atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1193 status, rq);
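/*
 * Annotation: engine busyness accounting. stats.active counts the contexts
 * currently resident on the HW; the first context in and the last context
 * out bracket a busy interval, with those transitions recorded under the
 * stats seqlock so that readers always see a consistent start/total pair.
 */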
1196 static void intel_engine_context_in(struct intel_engine_cs *engine)
1198 unsigned long flags;
1200 if (atomic_add_unless(&engine->stats.active, 1, 0))
1201 return;
1203 write_seqlock_irqsave(&engine->stats.lock, flags);
1204 if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1205 engine->stats.start = ktime_get();
1206 atomic_inc(&engine->stats.active);
1208 write_sequnlock_irqrestore(&engine->stats.lock, flags);
1211 static void intel_engine_context_out(struct intel_engine_cs *engine)
1213 unsigned long flags;
1215 GEM_BUG_ON(!atomic_read(&engine->stats.active));
1217 if (atomic_add_unless(&engine->stats.active, -1, 1))
1218 return;
1220 write_seqlock_irqsave(&engine->stats.lock, flags);
1221 if (atomic_dec_and_test(&engine->stats.active)) {
1222 engine->stats.total =
1223 ktime_add(engine->stats.total,
1224 ktime_sub(ktime_get(), engine->stats.start));
1226 write_sequnlock_irqrestore(&engine->stats.lock, flags);
1229 static void
1230 execlists_check_context(const struct intel_context *ce,
1231 const struct intel_engine_cs *engine)
1233 const struct intel_ring *ring = ce->ring;
1234 u32 *regs = ce->lrc_reg_state;
1235 bool valid = true;
1236 int x;
1238 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1239 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1240 engine->name,
1241 regs[CTX_RING_START],
1242 i915_ggtt_offset(ring->vma));
1243 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1244 valid = false;
1247 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1248 (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1249 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1250 engine->name,
1251 regs[CTX_RING_CTL],
1252 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1253 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1254 valid = false;
1257 x = lrc_ring_mi_mode(engine);
1258 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1259 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1260 engine->name, regs[x + 1]);
1261 regs[x + 1] &= ~STOP_RING;
1262 regs[x + 1] |= STOP_RING << 16;
1263 valid = false;
1266 WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1269 static void restore_default_state(struct intel_context *ce,
1270 struct intel_engine_cs *engine)
1272 u32 *regs;
1274 regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1275 execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1277 ce->runtime.last = intel_context_get_runtime(ce);
1280 static void reset_active(struct i915_request *rq,
1281 struct intel_engine_cs *engine)
1283 struct intel_context * const ce = rq->context;
1284 u32 head;
1287 * The executing context has been cancelled. We want to prevent
1288 * further execution along this context and propagate the error on
1289 * to anything depending on its results.
1291 * In __i915_request_submit(), we apply the -EIO and remove the
1292 * requests' payloads for any banned requests. But first, we must
1293 * rewind the context back to the start of the incomplete request so
1294 * that we do not jump back into the middle of the batch.
1296 * We preserve the breadcrumbs and semaphores of the incomplete
1297 * requests so that inter-timeline dependencies (i.e other timelines)
1298 * remain correctly ordered. And we defer to __i915_request_submit()
1299 * so that all asynchronous waits are correctly handled.
1301 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1302 rq->fence.context, rq->fence.seqno);
1304 /* On resubmission of the active request, payload will be scrubbed */
1305 if (i915_request_completed(rq))
1306 head = rq->tail;
1307 else
1308 head = active_request(ce->timeline, rq)->head;
1309 head = intel_ring_wrap(ce->ring, head);
1311 /* Scrub the context image to prevent replaying the previous batch */
1312 restore_default_state(ce, engine);
1313 __execlists_update_reg_state(ce, engine, head);
1315 /* We've switched away, so this should be a no-op, but intent matters */
1316 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1319 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1321 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1322 ce->runtime.num_underflow += dt < 0;
1323 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1324 #endif
1327 static void intel_context_update_runtime(struct intel_context *ce)
1329 u32 old;
1330 s32 dt;
1332 if (intel_context_is_barrier(ce))
1333 return;
1335 old = ce->runtime.last;
1336 ce->runtime.last = intel_context_get_runtime(ce);
1337 dt = ce->runtime.last - old;
1339 if (unlikely(dt <= 0)) {
1340 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1341 old, ce->runtime.last, dt);
1342 st_update_runtime_underflow(ce, dt);
1343 return;
1346 ewma_runtime_add(&ce->runtime.avg, dt);
1347 ce->runtime.total += dt;
1350 static inline struct intel_engine_cs *
1351 __execlists_schedule_in(struct i915_request *rq)
1353 struct intel_engine_cs * const engine = rq->engine;
1354 struct intel_context * const ce = rq->context;
1356 intel_context_get(ce);
1358 if (unlikely(intel_context_is_banned(ce)))
1359 reset_active(rq, engine);
1361 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1362 execlists_check_context(ce, engine);
1364 if (ce->tag) {
1365 /* Use a fixed tag for OA and friends */
1366 GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1367 ce->lrc.ccid = ce->tag;
1368 } else {
1369 /* We don't need a strict matching tag, just different values */
1370 unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1372 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1373 clear_bit(tag - 1, &engine->context_tag);
1374 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1376 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1379 ce->lrc.ccid |= engine->execlists.ccid;
1381 __intel_gt_pm_get(engine->gt);
1382 if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1383 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1384 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1385 intel_engine_context_in(engine);
1387 return engine;
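/*
 * Annotation: ce->inflight is a tagged pointer. It holds the engine the
 * context is currently executing on, with a small count of additional
 * inflight requests packed into the low bits (see ptr_inc()/ptr_dec() and
 * ptr_unmask_bits(..., 2) in execlists_schedule_in()/out() below). Only the
 * transitions from and back to NULL perform the heavyweight
 * __execlists_schedule_in()/__execlists_schedule_out() work.
 */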
1390 static inline struct i915_request *
1391 execlists_schedule_in(struct i915_request *rq, int idx)
1393 struct intel_context * const ce = rq->context;
1394 struct intel_engine_cs *old;
1396 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1397 trace_i915_request_in(rq, idx);
1399 old = READ_ONCE(ce->inflight);
1400 do {
1401 if (!old) {
1402 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1403 break;
1405 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1407 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1408 return i915_request_get(rq);
1411 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1413 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1414 struct i915_request *next = READ_ONCE(ve->request);
1416 if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1417 tasklet_hi_schedule(&ve->base.execlists.tasklet);
1420 static inline void
1421 __execlists_schedule_out(struct i915_request *rq,
1422 struct intel_engine_cs * const engine,
1423 unsigned int ccid)
1425 struct intel_context * const ce = rq->context;
1428 * NB process_csb() is not under the engine->active.lock and hence
1429 * schedule_out can race with schedule_in meaning that we should
1430 * refrain from doing non-trivial work here.
1434 * If we have just completed this context, the engine may now be
1435 * idle and we want to re-enter powersaving.
1437 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1438 i915_request_completed(rq))
1439 intel_engine_add_retire(engine, ce->timeline);
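/*
 * Annotation: recycle the SW context ID. The ccid was allocated from
 * engine->context_tag in __execlists_schedule_in() and shifted into the
 * GEN11 SW context ID field; undo that shift here and return the tag bit
 * to the pool now that the HW has switched the context out.
 */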
1441 ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1442 ccid &= GEN12_MAX_CONTEXT_HW_ID;
1443 if (ccid < BITS_PER_LONG) {
1444 GEM_BUG_ON(ccid == 0);
1445 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1446 set_bit(ccid - 1, &engine->context_tag);
1449 intel_context_update_runtime(ce);
1450 intel_engine_context_out(engine);
1451 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1452 if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1453 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1454 intel_gt_pm_put_async(engine->gt);
1457 * If this is part of a virtual engine, its next request may
1458 * have been blocked waiting for access to the active context.
1459 * We have to kick all the siblings again in case we need to
1460 * switch (e.g. the next request is not runnable on this
1461 * engine). Hopefully, we will already have submitted the next
1462 * request before the tasklet runs and do not need to rebuild
1463 * each virtual tree and kick everyone again.
1465 if (ce->engine != engine)
1466 kick_siblings(rq, ce);
1468 intel_context_put(ce);
1471 static inline void
1472 execlists_schedule_out(struct i915_request *rq)
1474 struct intel_context * const ce = rq->context;
1475 struct intel_engine_cs *cur, *old;
1476 u32 ccid;
1478 trace_i915_request_out(rq);
1480 ccid = rq->context->lrc.ccid;
1481 old = READ_ONCE(ce->inflight);
1483 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1484 while (!try_cmpxchg(&ce->inflight, &old, cur));
1485 if (!cur)
1486 __execlists_schedule_out(rq, old, ccid);
1488 i915_request_put(rq);
1491 static u64 execlists_update_context(struct i915_request *rq)
1493 struct intel_context *ce = rq->context;
1494 u64 desc = ce->lrc.desc;
1495 u32 tail, prev;
1498 * WaIdleLiteRestore:bdw,skl
1500 * We should never submit the context with the same RING_TAIL twice
1501 * just in case we submit an empty ring, which confuses the HW.
1503 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1504 * the normal request to be able to always advance the RING_TAIL on
1505 * subsequent resubmissions (for lite restore). Should that fail us,
1506 * and we try and submit the same tail again, force the context
1507 * reload.
1509 * If we need to return to a preempted context, we need to skip the
1510 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1511 * HW has a tendency to ignore us rewinding the TAIL to the end of
1512 * an earlier request.
1514 GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1515 prev = rq->ring->tail;
1516 tail = intel_ring_set_tail(rq->ring, rq->tail);
1517 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1518 desc |= CTX_DESC_FORCE_RESTORE;
1519 ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1520 rq->tail = rq->wa_tail;
1523 * Make sure the context image is complete before we submit it to HW.
1525 * Ostensibly, writes (including the WCB) should be flushed prior to
1526 * an uncached write such as our mmio register access; however, the empirical
1527 * evidence (esp. on Braswell) suggests that the WC write into memory
1528 * may not be visible to the HW prior to the completion of the UC
1529 * register write and that we may begin execution from the context
1530 * before its image is complete leading to invalid PD chasing.
1532 wmb();
1534 ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1535 return desc;
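/*
 * Annotation, assuming the usual ELSP/ELSQ split: on engines with an
 * execlists control register (ctrl_reg) the two descriptors are staged in
 * the ELSQ and latched by a separate EL_CTRL_LOAD write (see
 * execlists_submit_ports()); on older engines the descriptors are written
 * directly to the ELSP port, upper dword before lower, with port 1 written
 * before port 0 because the submit loop counts the ports down.
 */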
1538 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1540 if (execlists->ctrl_reg) {
1541 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1542 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1543 } else {
1544 writel(upper_32_bits(desc), execlists->submit_reg);
1545 writel(lower_32_bits(desc), execlists->submit_reg);
1549 static __maybe_unused char *
1550 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1552 if (!rq)
1553 return "";
1555 snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1556 prefix,
1557 rq->context->lrc.ccid,
1558 rq->fence.context, rq->fence.seqno,
1559 i915_request_completed(rq) ? "!" :
1560 i915_request_started(rq) ? "*" :
1562 rq_prio(rq));
1564 return buf;
1567 static __maybe_unused void
1568 trace_ports(const struct intel_engine_execlists *execlists,
1569 const char *msg,
1570 struct i915_request * const *ports)
1572 const struct intel_engine_cs *engine =
1573 container_of(execlists, typeof(*engine), execlists);
1574 char __maybe_unused p0[40], p1[40];
1576 if (!ports[0])
1577 return;
1579 ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1580 dump_port(p0, sizeof(p0), "", ports[0]),
1581 dump_port(p1, sizeof(p1), ", ", ports[1]));
1584 static inline bool
1585 reset_in_progress(const struct intel_engine_execlists *execlists)
1587 return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1590 static __maybe_unused bool
1591 assert_pending_valid(const struct intel_engine_execlists *execlists,
1592 const char *msg)
1594 struct intel_engine_cs *engine =
1595 container_of(execlists, typeof(*engine), execlists);
1596 struct i915_request * const *port, *rq;
1597 struct intel_context *ce = NULL;
1598 bool sentinel = false;
1599 u32 ccid = -1;
1601 trace_ports(execlists, msg, execlists->pending);
1603 /* We may be messing around with the lists during reset, lalala */
1604 if (reset_in_progress(execlists))
1605 return true;
1607 if (!execlists->pending[0]) {
1608 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1609 engine->name);
1610 return false;
1613 if (execlists->pending[execlists_num_ports(execlists)]) {
1614 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1615 engine->name, execlists_num_ports(execlists));
1616 return false;
1619 for (port = execlists->pending; (rq = *port); port++) {
1620 unsigned long flags;
1621 bool ok = true;
1623 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1624 GEM_BUG_ON(!i915_request_is_active(rq));
1626 if (ce == rq->context) {
1627 GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1628 engine->name,
1629 ce->timeline->fence_context,
1630 port - execlists->pending);
1631 return false;
1633 ce = rq->context;
1635 if (ccid == ce->lrc.ccid) {
1636 GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1637 engine->name,
1638 ccid, ce->timeline->fence_context,
1639 port - execlists->pending);
1640 return false;
1642 ccid = ce->lrc.ccid;
1645 * Sentinels are supposed to be the last request so they flush
1646 * the current execution off the HW. Check that they are the only
1647 * request in the pending submission.
1649 if (sentinel) {
1650 GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1651 engine->name,
1652 ce->timeline->fence_context,
1653 port - execlists->pending);
1654 return false;
1656 sentinel = i915_request_has_sentinel(rq);
1658 /* Hold tightly onto the lock to prevent concurrent retires! */
1659 if (!spin_trylock_irqsave(&rq->lock, flags))
1660 continue;
1662 if (i915_request_completed(rq))
1663 goto unlock;
1665 if (i915_active_is_idle(&ce->active) &&
1666 !intel_context_is_barrier(ce)) {
1667 GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1668 engine->name,
1669 ce->timeline->fence_context,
1670 port - execlists->pending);
1671 ok = false;
1672 goto unlock;
1675 if (!i915_vma_is_pinned(ce->state)) {
1676 GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1677 engine->name,
1678 ce->timeline->fence_context,
1679 port - execlists->pending);
1680 ok = false;
1681 goto unlock;
1684 if (!i915_vma_is_pinned(ce->ring->vma)) {
1685 GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1686 engine->name,
1687 ce->timeline->fence_context,
1688 port - execlists->pending);
1689 ok = false;
1690 goto unlock;
1693 unlock:
1694 spin_unlock_irqrestore(&rq->lock, flags);
1695 if (!ok)
1696 return false;
1699 return ce;
1702 static void execlists_submit_ports(struct intel_engine_cs *engine)
1704 struct intel_engine_execlists *execlists = &engine->execlists;
1705 unsigned int n;
1707 GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1710 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1711 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1712 * not be relinquished until the device is idle (see
1713 * i915_gem_idle_work_handler()). As a precaution, we make sure
1714 * that all ELSP are drained i.e. we have processed the CSB,
1715 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1717 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1720 * ELSQ note: the submit queue is not cleared after being submitted
1721 * to the HW so we need to make sure we always clean it up. This is
1722 * currently ensured by the fact that we always write the same number
1723 * of elsq entries, keep this in mind before changing the loop below.
1725 for (n = execlists_num_ports(execlists); n--; ) {
1726 struct i915_request *rq = execlists->pending[n];
1728 write_desc(execlists,
1729 rq ? execlists_update_context(rq) : 0,
1733 /* we need to manually load the submit queue */
1734 if (execlists->ctrl_reg)
1735 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1738 static bool ctx_single_port_submission(const struct intel_context *ce)
1740 return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1741 intel_context_force_single_submission(ce));
1744 static bool can_merge_ctx(const struct intel_context *prev,
1745 const struct intel_context *next)
1747 if (prev != next)
1748 return false;
1750 if (ctx_single_port_submission(prev))
1751 return false;
1753 return true;
1756 static unsigned long i915_request_flags(const struct i915_request *rq)
1758 return READ_ONCE(rq->fence.flags);
1761 static bool can_merge_rq(const struct i915_request *prev,
1762 const struct i915_request *next)
1764 GEM_BUG_ON(prev == next);
1765 GEM_BUG_ON(!assert_priority_queue(prev, next));
1768 * We do not submit known completed requests. Therefore if the next
1769 * request is already completed, we can pretend to merge it in
1770 * with the previous context (and we will skip updating the ELSP
1771 * and tracking). Thus hopefully keeping the ELSP full with active
1772 * contexts, despite the best efforts of preempt-to-busy to confuse
1773 * us.
1775 if (i915_request_completed(next))
1776 return true;
1778 if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1779 (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1780 BIT(I915_FENCE_FLAG_SENTINEL))))
1781 return false;
1783 if (!can_merge_ctx(prev->context, next->context))
1784 return false;
1786 GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1787 return true;
1790 static void virtual_update_register_offsets(u32 *regs,
1791 struct intel_engine_cs *engine)
1793 set_offsets(regs, reg_offsets(engine), engine, false);
1796 static bool virtual_matches(const struct virtual_engine *ve,
1797 const struct i915_request *rq,
1798 const struct intel_engine_cs *engine)
1800 const struct intel_engine_cs *inflight;
1802 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1803 return false;
1806 * We track when the HW has completed saving the context image
1807 * (i.e. when we have seen the final CS event switching out of
1808 * the context) and must not overwrite the context image before
1809 * then. This restricts us to only using the active engine
1810 * while the previous virtualized request is inflight (so
1811 * we reuse the register offsets). This is a very small
1812 * hysteresis on the greedy selection algorithm.
1814 inflight = intel_context_inflight(&ve->context);
1815 if (inflight && inflight != engine)
1816 return false;
1818 return true;
1821 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve)
1824 * All the outstanding signals on ve->siblings[0] must have
1825 * been completed, just pending the interrupt handler. As those
1826 * signals still refer to the old sibling (via rq->engine), we must
1827 * transfer those to the old irq_worker to keep our locking
1828 * consistent.
1830 intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context);
1833 #define for_each_waiter(p__, rq__) \
1834 list_for_each_entry_lockless(p__, \
1835 &(rq__)->sched.waiters_list, \
1836 wait_link)
1838 #define for_each_signaler(p__, rq__) \
1839 list_for_each_entry_rcu(p__, \
1840 &(rq__)->sched.signalers_list, \
1841 signal_link)
1843 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1845 LIST_HEAD(list);
1848 * We want to move the interrupted request to the back of
1849 * the round-robin list (i.e. its priority level), but
1850 * in doing so, we must then move all requests that were in
1851 * flight and were waiting for the interrupted request to
1852 * be run after it again.
1854 do {
1855 struct i915_dependency *p;
1857 GEM_BUG_ON(i915_request_is_active(rq));
1858 list_move_tail(&rq->sched.link, pl);
1860 for_each_waiter(p, rq) {
1861 struct i915_request *w =
1862 container_of(p->waiter, typeof(*w), sched);
1864 if (p->flags & I915_DEPENDENCY_WEAK)
1865 continue;
1867 /* Leave semaphores spinning on the other engines */
1868 if (w->engine != rq->engine)
1869 continue;
1871 /* No waiter should start before its signaler */
1872 GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1873 i915_request_started(w) &&
1874 !i915_request_completed(rq));
1876 GEM_BUG_ON(i915_request_is_active(w));
1877 if (!i915_request_is_ready(w))
1878 continue;
1880 if (rq_prio(w) < rq_prio(rq))
1881 continue;
1883 GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1884 list_move_tail(&w->sched.link, &list);
1887 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1888 } while (rq);
1891 static void defer_active(struct intel_engine_cs *engine)
1893 struct i915_request *rq;
1895 rq = __unwind_incomplete_requests(engine);
1896 if (!rq)
1897 return;
1899 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1902 static bool
1903 need_timeslice(const struct intel_engine_cs *engine,
1904 const struct i915_request *rq,
1905 const struct rb_node *rb)
1907 int hint;
1909 if (!intel_engine_has_timeslices(engine))
1910 return false;
1912 hint = engine->execlists.queue_priority_hint;
1914 if (rb) {
1915 const struct virtual_engine *ve =
1916 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1917 const struct intel_engine_cs *inflight =
1918 intel_context_inflight(&ve->context);
1920 if (!inflight || inflight == engine) {
1921 struct i915_request *next;
1923 rcu_read_lock();
1924 next = READ_ONCE(ve->request);
1925 if (next)
1926 hint = max(hint, rq_prio(next));
1927 rcu_read_unlock();
1931 if (!list_is_last(&rq->sched.link, &engine->active.requests))
1932 hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1934 GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1935 return hint >= effective_prio(rq);
1938 static bool
1939 timeslice_yield(const struct intel_engine_execlists *el,
1940 const struct i915_request *rq)
1943 * Once bitten, forever smitten!
1945 * If the active context ever busy-waited on a semaphore,
1946 * it will be treated as a hog until the end of its timeslice (i.e.
1947 * until it is scheduled out and replaced by a new submission,
1948 * possibly even its own lite-restore). The HW only sends an interrupt
1949 * on the first miss, and we do not know if that semaphore has been
1950 * signaled, or even if it is now stuck on another semaphore. Play
1951 * safe, yield if it might be stuck -- it will be given a fresh
1952 * timeslice in the near future.
1954 return rq->context->lrc.ccid == READ_ONCE(el->yield);
1957 static bool
1958 timeslice_expired(const struct intel_engine_execlists *el,
1959 const struct i915_request *rq)
1961 return timer_expired(&el->timer) || timeslice_yield(el, rq);
1964 static int
1965 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1967 if (list_is_last(&rq->sched.link, &engine->active.requests))
1968 return engine->execlists.queue_priority_hint;
1970 return rq_prio(list_next_entry(rq, sched.link));
1973 static inline unsigned long
1974 timeslice(const struct intel_engine_cs *engine)
1976 return READ_ONCE(engine->props.timeslice_duration_ms);
1979 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1981 const struct intel_engine_execlists *execlists = &engine->execlists;
1982 const struct i915_request *rq = *execlists->active;
1984 if (!rq || i915_request_completed(rq))
1985 return 0;
1987 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1988 return 0;
1990 return timeslice(engine);
1993 static void set_timeslice(struct intel_engine_cs *engine)
1995 unsigned long duration;
1997 if (!intel_engine_has_timeslices(engine))
1998 return;
2000 duration = active_timeslice(engine);
2001 ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2003 set_timer_ms(&engine->execlists.timer, duration);
2006 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2008 struct intel_engine_execlists *execlists = &engine->execlists;
2009 unsigned long duration;
2011 if (!intel_engine_has_timeslices(engine))
2012 return;
2014 WRITE_ONCE(execlists->switch_priority_hint, prio);
2015 if (prio == INT_MIN)
2016 return;
2018 if (timer_pending(&execlists->timer))
2019 return;
2021 duration = timeslice(engine);
2022 ENGINE_TRACE(engine,
2023 "start timeslicing, prio:%d, interval:%lu",
2024 prio, duration);
2026 set_timer_ms(&execlists->timer, duration);
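/*
 * Summary of the flow wired up above: set_timer_ms() arms execlists->timer;
 * when it expires, execlists_timeslice() (further down in this file) kicks
 * the submission tasklet, which re-runs execlists_dequeue(). There,
 * need_timeslice() and timeslice_expired() decide whether to rotate the
 * active context via defer_active(), granting the next context its own
 * timeslice.
 */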
2029 static void record_preemption(struct intel_engine_execlists *execlists)
2031 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2034 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2035 const struct i915_request *rq)
2037 if (!rq)
2038 return 0;
2040 /* Force a fast reset for terminated contexts (ignoring sysfs!) */
2041 if (unlikely(intel_context_is_banned(rq->context)))
2042 return 1;
2044 return READ_ONCE(engine->props.preempt_timeout_ms);
2047 static void set_preempt_timeout(struct intel_engine_cs *engine,
2048 const struct i915_request *rq)
2050 if (!intel_engine_has_preempt_reset(engine))
2051 return;
2053 set_timer_ms(&engine->execlists.preempt,
2054 active_preempt_timeout(engine, rq));
2057 static inline void clear_ports(struct i915_request **ports, int count)
2059 memset_p((void **)ports, NULL, count);
2062 static inline void
2063 copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2065 /* A memcpy_p() would be very useful here! */
2066 while (count--)
2067 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
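/*
 * The per-element WRITE_ONCE() above matters because the inflight[] array
 * is read locklessly (see the seqlock-style smp_wmb() pairing around
 * execlists->active later in this file, for execlists_active()): each slot
 * must be published with a single store so a concurrent reader sees either
 * the old or the new request pointer, never a torn value.
 */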
2070 static void execlists_dequeue(struct intel_engine_cs *engine)
2072 struct intel_engine_execlists * const execlists = &engine->execlists;
2073 struct i915_request **port = execlists->pending;
2074 struct i915_request ** const last_port = port + execlists->port_mask;
2075 struct i915_request * const *active;
2076 struct i915_request *last;
2077 struct rb_node *rb;
2078 bool submit = false;
2081 * Hardware submission is through 2 ports. Conceptually each port
2082 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2083 * static for a context, and unique to each, so we only execute
2084 * requests belonging to a single context from each ring. RING_HEAD
2085 * is maintained by the CS in the context image, it marks the place
2086 * where it got up to last time, and through RING_TAIL we tell the CS
2087 * where we want to execute up to this time.
2089 * In this list the requests are in order of execution. Consecutive
2090 * requests from the same context are adjacent in the ringbuffer. We
2091 * can combine these requests into a single RING_TAIL update:
2093 * RING_HEAD...req1...req2
2094 * ^- RING_TAIL
2095 * since to execute req2 the CS must first execute req1.
2097 * Our goal then is to point each port to the end of a consecutive
2098 * sequence of requests as being the optimal (fewest wake-ups
2099 * and context switches) submission.
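/*
 * Worked illustration (the offsets are made up for the example): if req1
 * occupies ring offsets [0x100, 0x180) and req2, from the same context,
 * occupies [0x180, 0x200), a single ELSP submission with RING_TAIL = 0x200
 * executes both. The CS resumes from the RING_HEAD stored in the context
 * image and runs until it reaches the new TAIL, so req1 never has to be
 * named to the hardware at all.
 */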
2102 for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2103 struct virtual_engine *ve =
2104 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2105 struct i915_request *rq = READ_ONCE(ve->request);
2107 if (!rq) { /* lazily cleanup after another engine handled rq */
2108 rb_erase_cached(rb, &execlists->virtual);
2109 RB_CLEAR_NODE(rb);
2110 rb = rb_first_cached(&execlists->virtual);
2111 continue;
2114 if (!virtual_matches(ve, rq, engine)) {
2115 rb = rb_next(rb);
2116 continue;
2119 break;
2123 * If the queue is higher priority than the last
2124 * request in the currently active context, submit afresh.
2125 * We will resubmit again afterwards in case we need to split
2126 * the active context to interject the preemption request,
2127 * i.e. we will retrigger preemption following the ack in case
2128 * of trouble.
2130 active = READ_ONCE(execlists->active);
2133 * In theory we can skip over completed contexts that have not
2134 * yet been processed by events (as those events are in flight):
2136 * while ((last = *active) && i915_request_completed(last))
2137 * active++;
2139 * However, the GPU cannot handle this as it will ultimately
2140 * find itself trying to jump back into a context it has just
2141 * completed and barf.
2144 if ((last = *active)) {
2145 if (need_preempt(engine, last, rb)) {
2146 if (i915_request_completed(last)) {
2147 tasklet_hi_schedule(&execlists->tasklet);
2148 return;
2151 ENGINE_TRACE(engine,
2152 "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2153 last->fence.context,
2154 last->fence.seqno,
2155 last->sched.attr.priority,
2156 execlists->queue_priority_hint);
2157 record_preemption(execlists);
2160 * Don't let the RING_HEAD advance past the breadcrumb
2161 * as we unwind (and until we resubmit) so that we do
2162 * not accidentally tell it to go backwards.
2164 ring_set_paused(engine, 1);
2167 * Note that we have not stopped the GPU at this point,
2168 * so we are unwinding the incomplete requests as they
2169 * remain inflight and so by the time we do complete
2170 * the preemption, some of the unwound requests may
2171 * complete!
2173 __unwind_incomplete_requests(engine);
2175 last = NULL;
2176 } else if (need_timeslice(engine, last, rb) &&
2177 timeslice_expired(execlists, last)) {
2178 if (i915_request_completed(last)) {
2179 tasklet_hi_schedule(&execlists->tasklet);
2180 return;
2183 ENGINE_TRACE(engine,
2184 "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2185 last->fence.context,
2186 last->fence.seqno,
2187 last->sched.attr.priority,
2188 execlists->queue_priority_hint,
2189 yesno(timeslice_yield(execlists, last)));
2191 ring_set_paused(engine, 1);
2192 defer_active(engine);
2195 * Unlike for preemption, if we rewind and continue
2196 * executing the same context as previously active,
2197 * the order of execution will remain the same and
2198 * the tail will only advance. We do not need to
2199 * force a full context restore, as a lite-restore
2200 * is sufficient to resample the monotonic TAIL.
2202 * If we switch to any other context, similarly we
2203 * will not rewind the TAIL of the current context, and
2204 * normal save/restore will preserve state and allow
2205 * us to later continue executing the same request.
2207 last = NULL;
2208 } else {
2210 * Otherwise if we already have a request pending
2211 * for execution after the current one, we can
2212 * just wait until the next CS event before
2213 * queuing more. In either case we will force a
2214 * lite-restore preemption event, but if we wait
2215 * we hopefully coalesce several updates into a single
2216 * submission.
2218 if (!list_is_last(&last->sched.link,
2219 &engine->active.requests)) {
2221 * Even if ELSP[1] is occupied and not worthy
2222 * of timeslices, our queue might be.
2224 start_timeslice(engine, queue_prio(execlists));
2225 return;
2230 while (rb) { /* XXX virtual is always taking precedence */
2231 struct virtual_engine *ve =
2232 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2233 struct i915_request *rq;
2235 spin_lock(&ve->base.active.lock);
2237 rq = ve->request;
2238 if (unlikely(!rq)) { /* lost the race to a sibling */
2239 spin_unlock(&ve->base.active.lock);
2240 rb_erase_cached(rb, &execlists->virtual);
2241 RB_CLEAR_NODE(rb);
2242 rb = rb_first_cached(&execlists->virtual);
2243 continue;
2246 GEM_BUG_ON(rq != ve->request);
2247 GEM_BUG_ON(rq->engine != &ve->base);
2248 GEM_BUG_ON(rq->context != &ve->context);
2250 if (rq_prio(rq) >= queue_prio(execlists)) {
2251 if (!virtual_matches(ve, rq, engine)) {
2252 spin_unlock(&ve->base.active.lock);
2253 rb = rb_next(rb);
2254 continue;
2257 if (last && !can_merge_rq(last, rq)) {
2258 spin_unlock(&ve->base.active.lock);
2259 start_timeslice(engine, rq_prio(rq));
2260 return; /* leave this for another sibling */
2263 ENGINE_TRACE(engine,
2264 "virtual rq=%llx:%lld%s, new engine? %s\n",
2265 rq->fence.context,
2266 rq->fence.seqno,
2267 i915_request_completed(rq) ? "!" :
2268 i915_request_started(rq) ? "*" :
2269 "",
2270 yesno(engine != ve->siblings[0]));
2272 WRITE_ONCE(ve->request, NULL);
2273 WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2274 INT_MIN);
2275 rb_erase_cached(rb, &execlists->virtual);
2276 RB_CLEAR_NODE(rb);
2278 GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2279 WRITE_ONCE(rq->engine, engine);
2281 if (engine != ve->siblings[0]) {
2282 u32 *regs = ve->context.lrc_reg_state;
2283 unsigned int n;
2285 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2287 if (!intel_engine_has_relative_mmio(engine))
2288 virtual_update_register_offsets(regs,
2289 engine);
2291 if (!list_empty(&ve->context.signals))
2292 virtual_xfer_breadcrumbs(ve);
2295 * Move the bound engine to the top of the list
2296 * for future execution. We then kick this
2297 * tasklet first before checking others, so that
2298 * we preferentially reuse this set of bound
2299 * registers.
2301 for (n = 1; n < ve->num_siblings; n++) {
2302 if (ve->siblings[n] == engine) {
2303 swap(ve->siblings[n],
2304 ve->siblings[0]);
2305 break;
2309 GEM_BUG_ON(ve->siblings[0] != engine);
2312 if (__i915_request_submit(rq)) {
2313 submit = true;
2314 last = rq;
2316 i915_request_put(rq);
2319 * Hmm, we have a bunch of virtual engine requests,
2320 * but the first one was already completed (thanks
2321 * preempt-to-busy!). Keep looking at the veng queue
2322 * until we have no more relevant requests (i.e.
2323 * the normal submit queue has higher priority).
2325 if (!submit) {
2326 spin_unlock(&ve->base.active.lock);
2327 rb = rb_first_cached(&execlists->virtual);
2328 continue;
2332 spin_unlock(&ve->base.active.lock);
2333 break;
2336 while ((rb = rb_first_cached(&execlists->queue))) {
2337 struct i915_priolist *p = to_priolist(rb);
2338 struct i915_request *rq, *rn;
2339 int i;
2341 priolist_for_each_request_consume(rq, rn, p, i) {
2342 bool merge = true;
2345 * Can we combine this request with the current port?
2346 * It has to be the same context/ringbuffer and not
2347 * have any exceptions (e.g. GVT saying never to
2348 * combine contexts).
2350 * If we can combine the requests, we can execute both
2351 * by updating the RING_TAIL to point to the end of the
2352 * second request, and so we never need to tell the
2353 * hardware about the first.
2355 if (last && !can_merge_rq(last, rq)) {
2357 * If we are on the second port and cannot
2358 * combine this request with the last, then we
2359 * are done.
2361 if (port == last_port)
2362 goto done;
2365 * We must not populate both ELSP[] with the
2366 * same LRCA, i.e. we must submit 2 different
2367 * contexts if we submit 2 ELSP.
2369 if (last->context == rq->context)
2370 goto done;
2372 if (i915_request_has_sentinel(last))
2373 goto done;
2376 * If GVT overrides us we only ever submit
2377 * port[0], leaving port[1] empty. Note that we
2378 * also have to be careful that we don't queue
2379 * the same context (even though a different
2380 * request) to the second port.
2382 if (ctx_single_port_submission(last->context) ||
2383 ctx_single_port_submission(rq->context))
2384 goto done;
2386 merge = false;
2389 if (__i915_request_submit(rq)) {
2390 if (!merge) {
2391 *port = execlists_schedule_in(last, port - execlists->pending);
2392 port++;
2393 last = NULL;
2396 GEM_BUG_ON(last &&
2397 !can_merge_ctx(last->context,
2398 rq->context));
2399 GEM_BUG_ON(last &&
2400 i915_seqno_passed(last->fence.seqno,
2401 rq->fence.seqno));
2403 submit = true;
2404 last = rq;
2408 rb_erase_cached(&p->node, &execlists->queue);
2409 i915_priolist_free(p);
2412 done:
2414 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2416 * We choose the priority hint such that if we add a request of greater
2417 * priority than this, we kick the submission tasklet to decide on
2418 * the right order of submitting the requests to hardware. We must
2419 * also be prepared to reorder requests as they are in-flight on the
2420 * HW. We derive the priority hint then as the first "hole" in
2421 * the HW submission ports and if there are no available slots,
2422 * the priority of the lowest executing request, i.e. last.
2424 * When we do receive a higher priority request ready to run from the
2425 * user, see queue_request(), the priority hint is bumped to that
2426 * request triggering preemption on the next dequeue (or subsequent
2427 * interrupt for secondary ports).
2429 execlists->queue_priority_hint = queue_prio(execlists);
2431 if (submit) {
2432 *port = execlists_schedule_in(last, port - execlists->pending);
2433 execlists->switch_priority_hint =
2434 switch_prio(engine, *execlists->pending);
2437 * Skip if we ended up with exactly the same set of requests,
2438 * e.g. trying to timeslice a pair of ordered contexts
2440 if (!memcmp(active, execlists->pending,
2441 (port - execlists->pending + 1) * sizeof(*port))) {
2442 do
2443 execlists_schedule_out(fetch_and_zero(port));
2444 while (port-- != execlists->pending);
2446 goto skip_submit;
2448 clear_ports(port + 1, last_port - port);
2450 WRITE_ONCE(execlists->yield, -1);
2451 set_preempt_timeout(engine, *active);
2452 execlists_submit_ports(engine);
2453 } else {
2454 start_timeslice(engine, execlists->queue_priority_hint);
2455 skip_submit:
2456 ring_set_paused(engine, 0);
2460 static void
2461 cancel_port_requests(struct intel_engine_execlists * const execlists)
2463 struct i915_request * const *port;
2465 for (port = execlists->pending; *port; port++)
2466 execlists_schedule_out(*port);
2467 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2469 /* Mark the end of active before we overwrite *active */
2470 for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2471 execlists_schedule_out(*port);
2472 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2474 smp_wmb(); /* complete the seqlock for execlists_active() */
2475 WRITE_ONCE(execlists->active, execlists->inflight);
2478 static inline void
2479 invalidate_csb_entries(const u32 *first, const u32 *last)
2481 clflush((void *)first);
2482 clflush((void *)last);
2486 * Starting with Gen12, the status has a new format:
2488 * bit 0: switched to new queue
2489 * bit 1: reserved
2490 * bit 2: semaphore wait mode (poll or signal), only valid when
2491 * switch detail is set to "wait on semaphore"
2492 * bits 3-5: engine class
2493 * bits 6-11: engine instance
2494 * bits 12-14: reserved
2495 * bits 15-25: sw context id of the lrc the GT switched to
2496 * bits 26-31: sw counter of the lrc the GT switched to
2497 * bits 32-35: context switch detail
2498 * - 0: ctx complete
2499 * - 1: wait on sync flip
2500 * - 2: wait on vblank
2501 * - 3: wait on scanline
2502 * - 4: wait on semaphore
2503 * - 5: context preempted (not on SEMAPHORE_WAIT or
2504 * WAIT_FOR_EVENT)
2505 * bit 36: reserved
2506 * bits 37-43: wait detail (for switch detail 1 to 4)
2507 * bits 44-46: reserved
2508 * bits 47-57: sw context id of the lrc the GT switched away from
2509 * bits 58-63: sw counter of the lrc the GT switched away from
2511 static inline bool
2512 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2514 u32 lower_dw = csb[0];
2515 u32 upper_dw = csb[1];
2516 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2517 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2518 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2521 * The context switch detail is not guaranteed to be 5 when a preemption
2522 * occurs, so we can't just check for that. The check below works for
2523 * all the cases we care about, including preemptions of WAIT
2524 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2525 * would require some extra handling, but we don't support that.
2527 if (!ctx_away_valid || new_queue) {
2528 GEM_BUG_ON(!ctx_to_valid);
2529 return true;
2533 * switch detail = 5 is covered by the case above and we do not expect a
2534 * context switch on an unsuccessful wait instruction since we always
2535 * use polling mode.
2537 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2538 return false;
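/*
 * Minimal decoding sketch for the Gen12 CSB layout documented above. The
 * struct and helper below are illustrative only (they are not part of
 * i915); the shifts and masks are taken directly from that bit layout.
 * gen12_csb_parse() itself only needs the ctx-valid checks, the new-queue
 * bit and the switch detail.
 */
struct gen12_csb_fields {
	bool switched_to_new_queue;	/* bit 0 */
	u32 to_sw_ctx_id;		/* bits 15-25 */
	u32 to_sw_counter;		/* bits 26-31 */
	u32 switch_detail;		/* bits 32-35 */
	u32 away_sw_ctx_id;		/* bits 47-57 */
	u32 away_sw_counter;		/* bits 58-63 */
};

static inline struct gen12_csb_fields gen12_csb_decode(const u32 *csb)
{
	const u64 entry = ((u64)csb[1] << 32) | csb[0];

	return (struct gen12_csb_fields) {
		.switched_to_new_queue	= entry & BIT(0),
		.to_sw_ctx_id		= (entry >> 15) & GENMASK(10, 0),
		.to_sw_counter		= (entry >> 26) & GENMASK(5, 0),
		.switch_detail		= (entry >> 32) & GENMASK(3, 0),
		.away_sw_ctx_id		= (entry >> 47) & GENMASK(10, 0),
		.away_sw_counter	= (entry >> 58) & GENMASK(5, 0),
	};
}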
2541 static inline bool
2542 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2544 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2547 static void process_csb(struct intel_engine_cs *engine)
2549 struct intel_engine_execlists * const execlists = &engine->execlists;
2550 const u32 * const buf = execlists->csb_status;
2551 const u8 num_entries = execlists->csb_size;
2552 u8 head, tail;
2555 * As we modify our execlists state tracking we require exclusive
2556 * access. Either we are inside the tasklet, or the tasklet is disabled
2557 * and we assume that is only inside the reset paths and so serialised.
2559 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2560 !reset_in_progress(execlists));
2561 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2564 * Note that csb_write, csb_status may be either in HWSP or mmio.
2565 * When reading from the csb_write mmio register, we have to be
2566 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2567 * the low 4 bits. As it happens we know the next 4 bits are always
2568 * zero and so we can simply mask off the low u8 of the register
2569 * and treat it identically to reading from the HWSP (without having
2570 * to use explicit shifting and masking, and probably bifurcating
2571 * the code to handle the legacy mmio read).
2573 head = execlists->csb_head;
2574 tail = READ_ONCE(*execlists->csb_write);
2575 if (unlikely(head == tail))
2576 return;
2579 * We will consume all events from HW, or at least pretend to.
2581 * The sequence of events from the HW is deterministic, and derived
2582 * from our writes to the ELSP, with a smidgen of variability for
2583 * the arrival of the asynchronous requests wrt the inflight
2584 * execution. If the HW sends an event that does not correspond with
2585 * the one we are expecting, we have to abandon all hope as we lose
2586 * all tracking of what the engine is actually executing. We will
2587 * only detect we are out of sequence with the HW when we get an
2588 * 'impossible' event because we have already drained our own
2589 * preemption/promotion queue. If this occurs, we know that we likely
2590 * lost track of execution earlier and must unwind and restart; the
2591 * simplest way is to stop processing the event queue and force the
2592 * engine to reset.
2594 execlists->csb_head = tail;
2595 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2598 * Hopefully paired with a wmb() in HW!
2600 * We must complete the read of the write pointer before any reads
2601 * from the CSB, so that we do not see stale values. Without an rmb
2602 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2603 * we perform the READ_ONCE(*csb_write).
2605 rmb();
2606 do {
2607 bool promote;
2609 if (++head == num_entries)
2610 head = 0;
2613 * We are flying near dragons again.
2615 * We hold a reference to the request in execlist_port[]
2616 * but no more than that. We are operating in softirq
2617 * context and so cannot hold any mutex or sleep. That
2618 * prevents us stopping the requests we are processing
2619 * in port[] from being retired simultaneously (the
2620 * breadcrumb will be complete before we see the
2621 * context-switch). As we only hold the reference to the
2622 * request, any pointer chasing underneath the request
2623 * is subject to a potential use-after-free. Thus we
2624 * store all of the bookkeeping within port[] as
2625 * required, and avoid using unguarded pointers beneath
2626 * request itself. The same applies to the atomic
2627 * status notifier.
2630 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2631 head, buf[2 * head + 0], buf[2 * head + 1]);
2633 if (INTEL_GEN(engine->i915) >= 12)
2634 promote = gen12_csb_parse(execlists, buf + 2 * head);
2635 else
2636 promote = gen8_csb_parse(execlists, buf + 2 * head);
2637 if (promote) {
2638 struct i915_request * const *old = execlists->active;
2640 if (GEM_WARN_ON(!*execlists->pending)) {
2641 execlists->error_interrupt |= ERROR_CSB;
2642 break;
2645 ring_set_paused(engine, 0);
2647 /* Point active to the new ELSP; prevent overwriting */
2648 WRITE_ONCE(execlists->active, execlists->pending);
2649 smp_wmb(); /* notify execlists_active() */
2651 /* cancel old inflight, prepare for switch */
2652 trace_ports(execlists, "preempted", old);
2653 while (*old)
2654 execlists_schedule_out(*old++);
2656 /* switch pending to inflight */
2657 GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2658 copy_ports(execlists->inflight,
2659 execlists->pending,
2660 execlists_num_ports(execlists));
2661 smp_wmb(); /* complete the seqlock */
2662 WRITE_ONCE(execlists->active, execlists->inflight);
2664 /* XXX Magic delay for tgl */
2665 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2667 WRITE_ONCE(execlists->pending[0], NULL);
2668 } else {
2669 if (GEM_WARN_ON(!*execlists->active)) {
2670 execlists->error_interrupt |= ERROR_CSB;
2671 break;
2674 /* port0 completed, advanced to port1 */
2675 trace_ports(execlists, "completed", execlists->active);
2678 * We rely on the hardware being strongly
2679 * ordered, that the breadcrumb write is
2680 * coherent (visible from the CPU) before the
2681 * user interrupt is processed. One might assume
2682 * that, as the breadcrumb write precedes both the
2683 * user interrupt and the CS event for the context
2684 * switch, it would therefore be visible before the
2685 * CS event itself...
2687 if (GEM_SHOW_DEBUG() &&
2688 !i915_request_completed(*execlists->active)) {
2689 struct i915_request *rq = *execlists->active;
2690 const u32 *regs __maybe_unused =
2691 rq->context->lrc_reg_state;
2693 ENGINE_TRACE(engine,
2694 "context completed before request!\n");
2695 ENGINE_TRACE(engine,
2696 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2697 ENGINE_READ(engine, RING_START),
2698 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2699 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2700 ENGINE_READ(engine, RING_CTL),
2701 ENGINE_READ(engine, RING_MI_MODE));
2702 ENGINE_TRACE(engine,
2703 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2704 i915_ggtt_offset(rq->ring->vma),
2705 rq->head, rq->tail,
2706 rq->fence.context,
2707 lower_32_bits(rq->fence.seqno),
2708 hwsp_seqno(rq));
2709 ENGINE_TRACE(engine,
2710 "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2711 regs[CTX_RING_START],
2712 regs[CTX_RING_HEAD],
2713 regs[CTX_RING_TAIL]);
2716 execlists_schedule_out(*execlists->active++);
2718 GEM_BUG_ON(execlists->active - execlists->inflight >
2719 execlists_num_ports(execlists));
2721 } while (head != tail);
2723 set_timeslice(engine);
2726 * Gen11 has proven to fail wrt the global observation point between
2727 * entry and tail update, failing on the ordering and thus
2728 * we see an old entry in the context status buffer.
2730 * Forcibly evict the entries for the next gpu csb update,
2731 * to increase the odds that we get fresh entries from non-
2732 * working hardware. The cost of doing so comes out mostly in
2733 * the wash as the hardware, working or not, will need to do the
2734 * invalidation before.
2736 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
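/*
 * Distilled sketch of the CSB consumption loop above (the helper name is
 * illustrative, not part of i915): latch the write pointer once, order the
 * CSB reads after it with rmb(), then walk the cached head forward,
 * wrapping at the buffer size, until it meets the latched tail.
 */
static void csb_walk_sketch(const u32 *buf, u8 num_entries,
			    u8 *cached_head, const u32 *csb_write)
{
	u8 head = *cached_head;
	const u8 tail = READ_ONCE(*csb_write);

	if (head == tail)
		return;

	rmb(); /* do not let the CSB[] reads overtake the write pointer */

	do {
		if (++head == num_entries)
			head = 0;

		/* each event is a pair of dwords: buf[2 * head + 0/1] */
	} while (head != tail);

	*cached_head = head;
}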
2739 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2741 lockdep_assert_held(&engine->active.lock);
2742 if (!READ_ONCE(engine->execlists.pending[0])) {
2743 rcu_read_lock(); /* protect peeking at execlists->active */
2744 execlists_dequeue(engine);
2745 rcu_read_unlock();
2749 static void __execlists_hold(struct i915_request *rq)
2751 LIST_HEAD(list);
2753 do {
2754 struct i915_dependency *p;
2756 if (i915_request_is_active(rq))
2757 __i915_request_unsubmit(rq);
2759 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2760 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2761 i915_request_set_hold(rq);
2762 RQ_TRACE(rq, "on hold\n");
2764 for_each_waiter(p, rq) {
2765 struct i915_request *w =
2766 container_of(p->waiter, typeof(*w), sched);
2768 /* Leave semaphores spinning on the other engines */
2769 if (w->engine != rq->engine)
2770 continue;
2772 if (!i915_request_is_ready(w))
2773 continue;
2775 if (i915_request_completed(w))
2776 continue;
2778 if (i915_request_on_hold(w))
2779 continue;
2781 list_move_tail(&w->sched.link, &list);
2784 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2785 } while (rq);
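/*
 * The do/while above with a local LIST_HEAD is an iterative walk of the
 * dependency graph rather than a recursive one, so arbitrarily deep waiter
 * chains cannot overflow the kernel stack. Each request is visited at most
 * once: i915_request_set_hold() is applied before its waiters are scanned,
 * and waiters already on hold are skipped.
 */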
2788 static bool execlists_hold(struct intel_engine_cs *engine,
2789 struct i915_request *rq)
2791 spin_lock_irq(&engine->active.lock);
2793 if (i915_request_completed(rq)) { /* too late! */
2794 rq = NULL;
2795 goto unlock;
2798 if (rq->engine != engine) { /* preempted virtual engine */
2799 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2802 * intel_context_inflight() is only protected by virtue
2803 * of process_csb() being called only by the tasklet (or
2804 * directly from inside reset while the tasklet is suspended).
2805 * Assert that neither of those is allowed to run while we
2806 * poke at the request queues.
2808 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2811 * An unsubmitted request along a virtual engine will
2812 * remain on the active (this) engine until we are able
2813 * to process the context switch away (and so mark the
2814 * context as no longer in flight). That cannot have happened
2815 * yet, otherwise we would not be hanging!
2817 spin_lock(&ve->base.active.lock);
2818 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2819 GEM_BUG_ON(ve->request != rq);
2820 ve->request = NULL;
2821 spin_unlock(&ve->base.active.lock);
2822 i915_request_put(rq);
2824 rq->engine = engine;
2828 * Transfer this request onto the hold queue to prevent it
2829 * being resubmitted to HW (and potentially completed) before we have
2830 * released it. Since we may have already submitted following
2831 * requests, we need to remove those as well.
2833 GEM_BUG_ON(i915_request_on_hold(rq));
2834 GEM_BUG_ON(rq->engine != engine);
2835 __execlists_hold(rq);
2836 GEM_BUG_ON(list_empty(&engine->active.hold));
2838 unlock:
2839 spin_unlock_irq(&engine->active.lock);
2840 return rq;
2843 static bool hold_request(const struct i915_request *rq)
2845 struct i915_dependency *p;
2846 bool result = false;
2849 * If one of our ancestors is on hold, we must also be on hold,
2850 * otherwise we will bypass it and execute before it.
2852 rcu_read_lock();
2853 for_each_signaler(p, rq) {
2854 const struct i915_request *s =
2855 container_of(p->signaler, typeof(*s), sched);
2857 if (s->engine != rq->engine)
2858 continue;
2860 result = i915_request_on_hold(s);
2861 if (result)
2862 break;
2864 rcu_read_unlock();
2866 return result;
2869 static void __execlists_unhold(struct i915_request *rq)
2871 LIST_HEAD(list);
2873 do {
2874 struct i915_dependency *p;
2876 RQ_TRACE(rq, "hold release\n");
2878 GEM_BUG_ON(!i915_request_on_hold(rq));
2879 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2881 i915_request_clear_hold(rq);
2882 list_move_tail(&rq->sched.link,
2883 i915_sched_lookup_priolist(rq->engine,
2884 rq_prio(rq)));
2885 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2887 /* Also release any children on this engine that are ready */
2888 for_each_waiter(p, rq) {
2889 struct i915_request *w =
2890 container_of(p->waiter, typeof(*w), sched);
2892 /* Propagate any change in error status */
2893 if (rq->fence.error)
2894 i915_request_set_error_once(w, rq->fence.error);
2896 if (w->engine != rq->engine)
2897 continue;
2899 if (!i915_request_on_hold(w))
2900 continue;
2902 /* Check that no other parents are also on hold */
2903 if (hold_request(w))
2904 continue;
2906 list_move_tail(&w->sched.link, &list);
2909 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2910 } while (rq);
2913 static void execlists_unhold(struct intel_engine_cs *engine,
2914 struct i915_request *rq)
2916 spin_lock_irq(&engine->active.lock);
2919 * Move this request back to the priority queue, and all of its
2920 * children and grandchildren that were suspended along with it.
2922 __execlists_unhold(rq);
2924 if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2925 engine->execlists.queue_priority_hint = rq_prio(rq);
2926 tasklet_hi_schedule(&engine->execlists.tasklet);
2929 spin_unlock_irq(&engine->active.lock);
2932 struct execlists_capture {
2933 struct work_struct work;
2934 struct i915_request *rq;
2935 struct i915_gpu_coredump *error;
2938 static void execlists_capture_work(struct work_struct *work)
2940 struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2941 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2942 struct intel_engine_cs *engine = cap->rq->engine;
2943 struct intel_gt_coredump *gt = cap->error->gt;
2944 struct intel_engine_capture_vma *vma;
2946 /* Compress all the objects attached to the request, slow! */
2947 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2948 if (vma) {
2949 struct i915_vma_compress *compress =
2950 i915_vma_capture_prepare(gt);
2952 intel_engine_coredump_add_vma(gt->engine, vma, compress);
2953 i915_vma_capture_finish(gt, compress);
2956 gt->simulated = gt->engine->simulated;
2957 cap->error->simulated = gt->simulated;
2959 /* Publish the error state, and announce it to the world */
2960 i915_error_state_store(cap->error);
2961 i915_gpu_coredump_put(cap->error);
2963 /* Return this request and all that depend upon it for signaling */
2964 execlists_unhold(engine, cap->rq);
2965 i915_request_put(cap->rq);
2967 kfree(cap);
2970 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2972 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2973 struct execlists_capture *cap;
2975 cap = kmalloc(sizeof(*cap), gfp);
2976 if (!cap)
2977 return NULL;
2979 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2980 if (!cap->error)
2981 goto err_cap;
2983 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2984 if (!cap->error->gt)
2985 goto err_gpu;
2987 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2988 if (!cap->error->gt->engine)
2989 goto err_gt;
2991 return cap;
2993 err_gt:
2994 kfree(cap->error->gt);
2995 err_gpu:
2996 kfree(cap->error);
2997 err_cap:
2998 kfree(cap);
2999 return NULL;
3002 static struct i915_request *
3003 active_context(struct intel_engine_cs *engine, u32 ccid)
3005 const struct intel_engine_execlists * const el = &engine->execlists;
3006 struct i915_request * const *port, *rq;
3009 * Use the most recent result from process_csb(), but just in case
3010 * we trigger an error (via interrupt) before the first CS event has
3011 * been written, peek at the next submission.
3014 for (port = el->active; (rq = *port); port++) {
3015 if (rq->context->lrc.ccid == ccid) {
3016 ENGINE_TRACE(engine,
3017 "ccid found at active:%zd\n",
3018 port - el->active);
3019 return rq;
3023 for (port = el->pending; (rq = *port); port++) {
3024 if (rq->context->lrc.ccid == ccid) {
3025 ENGINE_TRACE(engine,
3026 "ccid found at pending:%zd\n",
3027 port - el->pending);
3028 return rq;
3032 ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3033 return NULL;
3036 static u32 active_ccid(struct intel_engine_cs *engine)
3038 return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3041 static void execlists_capture(struct intel_engine_cs *engine)
3043 struct execlists_capture *cap;
3045 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3046 return;
3049 * We need to _quickly_ capture the engine state before we reset.
3050 * We are inside an atomic section (softirq) here and we are delaying
3051 * the forced preemption event.
3053 cap = capture_regs(engine);
3054 if (!cap)
3055 return;
3057 spin_lock_irq(&engine->active.lock);
3058 cap->rq = active_context(engine, active_ccid(engine));
3059 if (cap->rq) {
3060 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3061 cap->rq = i915_request_get_rcu(cap->rq);
3063 spin_unlock_irq(&engine->active.lock);
3064 if (!cap->rq)
3065 goto err_free;
3068 * Remove the request from the execlists queue, and take ownership
3069 * of the request. We pass it to our worker who will _slowly_ compress
3070 * all the pages the _user_ requested for debugging their batch, after
3071 * which we return it to the queue for signaling.
3073 * By removing them from the execlists queue, we also prevent the
3074 * requests from being processed by __unwind_incomplete_requests()
3075 * during the intel_engine_reset(), and so they will *not* be replayed
3076 * afterwards.
3078 * Note that because we have not yet reset the engine at this point,
3079 * it is possible that the request we have identified as being
3080 * guilty did in fact complete, and we will then hit an arbitration
3081 * point allowing the outstanding preemption to succeed. The likelihood
3082 * of that is very low (as capturing of the engine registers should be
3083 * fast enough to run inside an irq-off atomic section!), so we will
3084 * simply hold that request accountable for being non-preemptible
3085 * long enough to force the reset.
3087 if (!execlists_hold(engine, cap->rq))
3088 goto err_rq;
3090 INIT_WORK(&cap->work, execlists_capture_work);
3091 schedule_work(&cap->work);
3092 return;
3094 err_rq:
3095 i915_request_put(cap->rq);
3096 err_free:
3097 i915_gpu_coredump_put(cap->error);
3098 kfree(cap);
3101 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3103 const unsigned int bit = I915_RESET_ENGINE + engine->id;
3104 unsigned long *lock = &engine->gt->reset.flags;
3106 if (!intel_has_reset_engine(engine->gt))
3107 return;
3109 if (test_and_set_bit(bit, lock))
3110 return;
3112 ENGINE_TRACE(engine, "reset for %s\n", msg);
3114 /* Mark this tasklet as disabled to avoid waiting for it to complete */
3115 tasklet_disable_nosync(&engine->execlists.tasklet);
3117 ring_set_paused(engine, 1); /* Freeze the current request in place */
3118 execlists_capture(engine);
3119 intel_engine_reset(engine, msg);
3121 tasklet_enable(&engine->execlists.tasklet);
3122 clear_and_wake_up_bit(bit, lock);
3125 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3127 const struct timer_list *t = &engine->execlists.preempt;
3129 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3130 return false;
3132 if (!timer_expired(t))
3133 return false;
3135 return READ_ONCE(engine->execlists.pending[0]);
3139 * Check the unread Context Status Buffers and manage the submission of new
3140 * contexts to the ELSP accordingly.
3142 static void execlists_submission_tasklet(unsigned long data)
3144 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3145 bool timeout = preempt_timeout(engine);
3147 process_csb(engine);
3149 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3150 const char *msg;
3152 /* Generate the error message in priority wrt the user! */
3153 if (engine->execlists.error_interrupt & GENMASK(15, 0))
3154 msg = "CS error"; /* thrown by a user payload */
3155 else if (engine->execlists.error_interrupt & ERROR_CSB)
3156 msg = "invalid CSB event";
3157 else
3158 msg = "internal error";
3160 engine->execlists.error_interrupt = 0;
3161 execlists_reset(engine, msg);
3164 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3165 unsigned long flags;
3167 spin_lock_irqsave(&engine->active.lock, flags);
3168 __execlists_submission_tasklet(engine);
3169 spin_unlock_irqrestore(&engine->active.lock, flags);
3171 /* Recheck after serialising with direct-submission */
3172 if (unlikely(timeout && preempt_timeout(engine)))
3173 execlists_reset(engine, "preemption time out");
3177 static void __execlists_kick(struct intel_engine_execlists *execlists)
3179 /* Kick the tasklet for some interrupt coalescing and reset handling */
3180 tasklet_hi_schedule(&execlists->tasklet);
3183 #define execlists_kick(t, member) \
3184 __execlists_kick(container_of(t, struct intel_engine_execlists, member))
3186 static void execlists_timeslice(struct timer_list *timer)
3188 execlists_kick(timer, timer);
3191 static void execlists_preempt(struct timer_list *timer)
3193 execlists_kick(timer, preempt);
3196 static void queue_request(struct intel_engine_cs *engine,
3197 struct i915_request *rq)
3199 GEM_BUG_ON(!list_empty(&rq->sched.link));
3200 list_add_tail(&rq->sched.link,
3201 i915_sched_lookup_priolist(engine, rq_prio(rq)));
3202 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3205 static void __submit_queue_imm(struct intel_engine_cs *engine)
3207 struct intel_engine_execlists * const execlists = &engine->execlists;
3209 if (reset_in_progress(execlists))
3210 return; /* defer until we restart the engine following reset */
3212 __execlists_submission_tasklet(engine);
3215 static void submit_queue(struct intel_engine_cs *engine,
3216 const struct i915_request *rq)
3218 struct intel_engine_execlists *execlists = &engine->execlists;
3220 if (rq_prio(rq) <= execlists->queue_priority_hint)
3221 return;
3223 execlists->queue_priority_hint = rq_prio(rq);
3224 __submit_queue_imm(engine);
3227 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3228 const struct i915_request *rq)
3230 GEM_BUG_ON(i915_request_on_hold(rq));
3231 return !list_empty(&engine->active.hold) && hold_request(rq);
3234 static void flush_csb(struct intel_engine_cs *engine)
3236 struct intel_engine_execlists *el = &engine->execlists;
3238 if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3239 if (!reset_in_progress(el))
3240 process_csb(engine);
3241 tasklet_unlock(&el->tasklet);
3245 static void execlists_submit_request(struct i915_request *request)
3247 struct intel_engine_cs *engine = request->engine;
3248 unsigned long flags;
3250 /* Hopefully we clear execlists->pending[] to let us through */
3251 flush_csb(engine);
3253 /* Will be called from irq-context when using foreign fences. */
3254 spin_lock_irqsave(&engine->active.lock, flags);
3256 if (unlikely(ancestor_on_hold(engine, request))) {
3257 RQ_TRACE(request, "ancestor on hold\n");
3258 list_add_tail(&request->sched.link, &engine->active.hold);
3259 i915_request_set_hold(request);
3260 } else {
3261 queue_request(engine, request);
3263 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3264 GEM_BUG_ON(list_empty(&request->sched.link));
3266 submit_queue(engine, request);
3269 spin_unlock_irqrestore(&engine->active.lock, flags);
3272 static void __execlists_context_fini(struct intel_context *ce)
3274 intel_ring_put(ce->ring);
3275 i915_vma_put(ce->state);
3278 static void execlists_context_destroy(struct kref *kref)
3280 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3282 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3283 GEM_BUG_ON(intel_context_is_pinned(ce));
3285 if (ce->state)
3286 __execlists_context_fini(ce);
3288 intel_context_fini(ce);
3289 intel_context_free(ce);
3292 static void
3293 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3295 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3296 return;
3298 vaddr += engine->context_size;
3300 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3303 static void
3304 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3306 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3307 return;
3309 vaddr += engine->context_size;
3311 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3312 drm_err_once(&engine->i915->drm,
3313 "%s context redzone overwritten!\n",
3314 engine->name);
3317 static void execlists_context_unpin(struct intel_context *ce)
3319 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3320 ce->engine);
3322 i915_gem_object_unpin_map(ce->state->obj);
3325 static u32 *
3326 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3328 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3329 MI_SRM_LRM_GLOBAL_GTT |
3330 MI_LRI_LRM_CS_MMIO;
3331 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3332 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3333 CTX_TIMESTAMP * sizeof(u32);
3334 *cs++ = 0;
3336 *cs++ = MI_LOAD_REGISTER_REG |
3337 MI_LRR_SOURCE_CS_MMIO |
3338 MI_LRI_LRM_CS_MMIO;
3339 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3340 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3342 *cs++ = MI_LOAD_REGISTER_REG |
3343 MI_LRR_SOURCE_CS_MMIO |
3344 MI_LRI_LRM_CS_MMIO;
3345 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3346 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3348 return cs;
3351 static u32 *
3352 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3354 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3356 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3357 MI_SRM_LRM_GLOBAL_GTT |
3358 MI_LRI_LRM_CS_MMIO;
3359 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3360 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3361 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3362 *cs++ = 0;
3364 return cs;
3367 static u32 *
3368 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3370 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3372 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3373 MI_SRM_LRM_GLOBAL_GTT |
3374 MI_LRI_LRM_CS_MMIO;
3375 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3376 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3377 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3378 *cs++ = 0;
3380 *cs++ = MI_LOAD_REGISTER_REG |
3381 MI_LRR_SOURCE_CS_MMIO |
3382 MI_LRI_LRM_CS_MMIO;
3383 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3384 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3386 return cs;
3389 static u32 *
3390 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3392 cs = gen12_emit_timestamp_wa(ce, cs);
3393 cs = gen12_emit_cmd_buf_wa(ce, cs);
3394 cs = gen12_emit_restore_scratch(ce, cs);
3396 return cs;
3399 static u32 *
3400 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3402 cs = gen12_emit_timestamp_wa(ce, cs);
3403 cs = gen12_emit_restore_scratch(ce, cs);
3405 return cs;
3408 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3410 return PAGE_SIZE * ce->wa_bb_page;
3413 static u32 *context_indirect_bb(const struct intel_context *ce)
3415 void *ptr;
3417 GEM_BUG_ON(!ce->wa_bb_page);
3419 ptr = ce->lrc_reg_state;
3420 ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3421 ptr += context_wa_bb_offset(ce);
3423 return ptr;
3426 static void
3427 setup_indirect_ctx_bb(const struct intel_context *ce,
3428 const struct intel_engine_cs *engine,
3429 u32 *(*emit)(const struct intel_context *, u32 *))
3431 u32 * const start = context_indirect_bb(ce);
3432 u32 *cs;
3434 cs = emit(ce, start);
3435 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3436 while ((unsigned long)cs % CACHELINE_BYTES)
3437 *cs++ = MI_NOOP;
3439 lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3440 i915_ggtt_offset(ce->state) +
3441 context_wa_bb_offset(ce),
3442 (cs - start) * sizeof(*cs));
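/*
 * Worked example of the MI_NOOP padding above, assuming the usual 64-byte
 * CACHELINE_BYTES: if emit() produced 22 dwords (88 bytes), the cursor sits
 * 24 bytes into the second cacheline, so 10 MI_NOOPs (40 bytes) are
 * appended and the reported size becomes 128 bytes, i.e. two whole
 * cachelines, matching the hardware's expectation that the indirect ctx
 * length is expressed in cacheline units.
 */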
3445 static void
3446 __execlists_update_reg_state(const struct intel_context *ce,
3447 const struct intel_engine_cs *engine,
3448 u32 head)
3450 struct intel_ring *ring = ce->ring;
3451 u32 *regs = ce->lrc_reg_state;
3453 GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3454 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3456 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3457 regs[CTX_RING_HEAD] = head;
3458 regs[CTX_RING_TAIL] = ring->tail;
3459 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3461 /* RPCS */
3462 if (engine->class == RENDER_CLASS) {
3463 regs[CTX_R_PWR_CLK_STATE] =
3464 intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3466 i915_oa_init_reg_state(ce, engine);
3469 if (ce->wa_bb_page) {
3470 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3472 fn = gen12_emit_indirect_ctx_xcs;
3473 if (ce->engine->class == RENDER_CLASS)
3474 fn = gen12_emit_indirect_ctx_rcs;
3476 /* Mutually exclusive wrt to global indirect bb */
3477 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3478 setup_indirect_ctx_bb(ce, engine, fn);
3482 static int
3483 __execlists_context_pin(struct intel_context *ce,
3484 struct intel_engine_cs *engine)
3486 void *vaddr;
3488 GEM_BUG_ON(!ce->state);
3489 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3491 vaddr = i915_gem_object_pin_map(ce->state->obj,
3492 i915_coherent_map_type(engine->i915) |
3493 I915_MAP_OVERRIDE);
3494 if (IS_ERR(vaddr))
3495 return PTR_ERR(vaddr);
3497 ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3498 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3499 __execlists_update_reg_state(ce, engine, ce->ring->tail);
3501 return 0;
3504 static int execlists_context_pin(struct intel_context *ce)
3506 return __execlists_context_pin(ce, ce->engine);
3509 static int execlists_context_alloc(struct intel_context *ce)
3511 return __execlists_context_alloc(ce, ce->engine);
3514 static void execlists_context_reset(struct intel_context *ce)
3516 CE_TRACE(ce, "reset\n");
3517 GEM_BUG_ON(!intel_context_is_pinned(ce));
3519 intel_ring_reset(ce->ring, ce->ring->emit);
3521 /* Scrub away the garbage */
3522 execlists_init_reg_state(ce->lrc_reg_state,
3523 ce, ce->engine, ce->ring, true);
3524 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3526 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3529 static const struct intel_context_ops execlists_context_ops = {
3530 .alloc = execlists_context_alloc,
3532 .pin = execlists_context_pin,
3533 .unpin = execlists_context_unpin,
3535 .enter = intel_context_enter_engine,
3536 .exit = intel_context_exit_engine,
3538 .reset = execlists_context_reset,
3539 .destroy = execlists_context_destroy,
3542 static u32 hwsp_offset(const struct i915_request *rq)
3544 const struct intel_timeline_cacheline *cl;
3546 /* Before the request is executed, the timeline/cacheline is fixed */
3548 cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
3549 if (cl)
3550 return cl->ggtt_offset;
3552 return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
3555 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3557 u32 *cs;
3559 GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3560 if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3561 return 0;
3563 cs = intel_ring_begin(rq, 6);
3564 if (IS_ERR(cs))
3565 return PTR_ERR(cs);
3568 * Check if we have been preempted before we even get started.
3570 * After this point i915_request_started() reports true, even if
3571 * we get preempted and so are no longer running.
3573 *cs++ = MI_ARB_CHECK;
3574 *cs++ = MI_NOOP;
3576 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3577 *cs++ = hwsp_offset(rq);
3578 *cs++ = 0;
3579 *cs++ = rq->fence.seqno - 1;
3581 intel_ring_advance(rq, cs);
3583 /* Record the updated position of the request's payload */
3584 rq->infix = intel_ring_offset(rq, cs);
3586 __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3588 return 0;
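/*
 * Rough sketch of what the store above enables (a simplification, not the
 * exact i915 implementation; the helper names are taken from elsewhere in
 * this file): by writing seqno - 1 into the request's HWSP slot from the
 * ring itself, the CPU can later infer "started" as
 *
 *	started = i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1);
 *
 * i.e. the CS has executed past this initial breadcrumb, even if the
 * request was preempted before its payload completed.
 */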
3591 static int emit_pdps(struct i915_request *rq)
3593 const struct intel_engine_cs * const engine = rq->engine;
3594 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3595 int err, i;
3596 u32 *cs;
3598 GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3601 * Beware ye of the dragons, this sequence is magic!
3603 * Small changes to this sequence can cause anything from
3604 * GPU hangs to forcewake errors and machine lockups!
3607 /* Flush any residual operations from the context load */
3608 err = engine->emit_flush(rq, EMIT_FLUSH);
3609 if (err)
3610 return err;
3612 /* Magic required to prevent forcewake errors! */
3613 err = engine->emit_flush(rq, EMIT_INVALIDATE);
3614 if (err)
3615 return err;
3617 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3618 if (IS_ERR(cs))
3619 return PTR_ERR(cs);
3621 /* Ensure the LRI have landed before we invalidate & continue */
3622 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3623 for (i = GEN8_3LVL_PDPES; i--; ) {
3624 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3625 u32 base = engine->mmio_base;
3627 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3628 *cs++ = upper_32_bits(pd_daddr);
3629 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3630 *cs++ = lower_32_bits(pd_daddr);
3632 *cs++ = MI_NOOP;
3634 intel_ring_advance(rq, cs);
3636 return 0;
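/*
 * Dword budget check for the intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2)
 * above: 1 dword for the MI_LOAD_REGISTER_IMM header, then 4 dwords per PDP
 * entry (UDW offset, UDW value, LDW offset, LDW value) for each of the
 * GEN8_3LVL_PDPES page directories, plus 1 trailing MI_NOOP -- exactly
 * 4 * GEN8_3LVL_PDPES + 2.
 */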
3639 static int execlists_request_alloc(struct i915_request *request)
3641 int ret;
3643 GEM_BUG_ON(!intel_context_is_pinned(request->context));
3646 * Flush enough space to reduce the likelihood of waiting after
3647 * we start building the request - in which case we will just
3648 * have to repeat work.
3650 request->reserved_space += EXECLISTS_REQUEST_SIZE;
3653 * Note that after this point, we have committed to using
3654 * this request as it is being used to both track the
3655 * state of engine initialisation and liveness of the
3656 * golden renderstate above. Think twice before you try
3657 * to cancel/unwind this request now.
3660 if (!i915_vm_is_4lvl(request->context->vm)) {
3661 ret = emit_pdps(request);
3662 if (ret)
3663 return ret;
3666 /* Unconditionally invalidate GPU caches and TLBs. */
3667 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3668 if (ret)
3669 return ret;
3671 request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3672 return 0;
3676 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3677 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3678 * but there is a slight complication as this is applied in a WA batch where the
3679 * values are only initialized once, so we cannot take the register value at the
3680 * beginning and reuse it further; hence we save its value to memory, upload a
3681 * constant value with bit21 set and then restore it with the saved value.
3682 * To simplify the WA, a constant value is formed by using the default value
3683 * of this register. This shouldn't be a problem because we are only modifying
3684 * it for a short period and this batch is non-preemptible. We could of course
3685 * use additional instructions that read the actual value of the register
3686 * at that time and set our bit of interest, but that makes the WA complicated.
3688 * This WA is also required for Gen9 so extracting as a function avoids
3689 * code duplication.
3691 static u32 *
3692 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3694 /* NB no one else is allowed to scribble over scratch + 256! */
3695 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3696 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3697 *batch++ = intel_gt_scratch_offset(engine->gt,
3698 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3699 *batch++ = 0;
3701 *batch++ = MI_LOAD_REGISTER_IMM(1);
3702 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3703 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3705 batch = gen8_emit_pipe_control(batch,
3706 PIPE_CONTROL_CS_STALL |
3707 PIPE_CONTROL_DC_FLUSH_ENABLE,
3710 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3711 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3712 *batch++ = intel_gt_scratch_offset(engine->gt,
3713 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3714 *batch++ = 0;
3716 return batch;
3720 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3721 * initialized at the beginning and shared across all contexts but this field
3722 * helps us to have multiple batches at different offsets and select them based
3723 * on some criteria. At the moment this batch always starts at the beginning of the page
3724 * and at this point we don't have multiple wa_ctx batch buffers.
3726 * The number of WAs applied is not known at the beginning; we use this field
3727 * to return the number of DWORDs written.
3729 * Note that this batch does not contain MI_BATCH_BUFFER_END,
3730 * so it adds NOOPs as padding to make it cacheline aligned.
3731 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them together
3732 * make a complete batch buffer.
3734 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3736 /* WaDisableCtxRestoreArbitration:bdw,chv */
3737 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3739 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3740 if (IS_BROADWELL(engine->i915))
3741 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3743 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3744 /* Actual scratch location is at 128 bytes offset */
3745 batch = gen8_emit_pipe_control(batch,
3746 PIPE_CONTROL_FLUSH_L3 |
3747 PIPE_CONTROL_STORE_DATA_INDEX |
3748 PIPE_CONTROL_CS_STALL |
3749 PIPE_CONTROL_QW_WRITE,
3750 LRC_PPHWSP_SCRATCH_ADDR);
3752 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3754 /* Pad to end of cacheline */
3755 while ((unsigned long)batch % CACHELINE_BYTES)
3756 *batch++ = MI_NOOP;
3759 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3760 * execution depends on the length specified in terms of cache lines
3761 * in the register CTX_RCS_INDIRECT_CTX
3764 return batch;
3767 struct lri {
3768 i915_reg_t reg;
3769 u32 value;
3772 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3774 GEM_BUG_ON(!count || count > 63);
3776 *batch++ = MI_LOAD_REGISTER_IMM(count);
3777 do {
3778 *batch++ = i915_mmio_reg_offset(lri->reg);
3779 *batch++ = lri->value;
3780 } while (lri++, --count);
3781 *batch++ = MI_NOOP;
3783 return batch;
3786 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3788 static const struct lri lri[] = {
3789 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3791 COMMON_SLICE_CHICKEN2,
3792 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3796 /* BSpec: 11391 */
3798 FF_SLICE_CHICKEN,
3799 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3800 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3803 /* BSpec: 11299 */
3805 _3D_CHICKEN3,
3806 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3807 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3811 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3813 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3814 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3816 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3817 batch = gen8_emit_pipe_control(batch,
3818 PIPE_CONTROL_FLUSH_L3 |
3819 PIPE_CONTROL_STORE_DATA_INDEX |
3820 PIPE_CONTROL_CS_STALL |
3821 PIPE_CONTROL_QW_WRITE,
3822 LRC_PPHWSP_SCRATCH_ADDR);
3824 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3826 /* WaMediaPoolStateCmdInWABB:bxt,glk */
3827 if (HAS_POOLED_EU(engine->i915)) {
3829 * EU pool configuration is set up along with the golden context
3830 * during context initialization. This value depends on the
3831 * device type (2x6 or 3x6) and needs to be updated based
3832 * on which subslice is disabled, especially for 2x6
3833 * devices; however, it is safe to load the default
3834 * configuration of a 3x6 device instead of masking off the
3835 * corresponding bits because the HW ignores bits of a disabled
3836 * subslice and drops down to the appropriate config. Please
3837 * see render_state_setup() in i915_gem_render_state.c for
3838 * possible configurations, to avoid duplication they are
3839 * not shown here again.
3841 *batch++ = GEN9_MEDIA_POOL_STATE;
3842 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3843 *batch++ = 0x00777000;
3844 *batch++ = 0;
3845 *batch++ = 0;
3846 *batch++ = 0;
3849 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3851 /* Pad to end of cacheline */
3852 while ((unsigned long)batch % CACHELINE_BYTES)
3853 *batch++ = MI_NOOP;
3855 return batch;
3858 static u32 *
3859 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3861 int i;
3864 * WaPipeControlBefore3DStateSamplePattern: cnl
3866 * Ensure the engine is idle prior to programming a
3867 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3869 batch = gen8_emit_pipe_control(batch,
3870 PIPE_CONTROL_CS_STALL,
3873 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3874 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3875 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3876 * confusing. Since gen8_emit_pipe_control() already advances the
3877 * batch by 6 dwords, we advance the other 10 here, completing a
3878 * cacheline. It's not clear if the workaround requires this padding
3879 * before other commands, or if it's just the regular padding we would
3880 * already have for the workaround bb, so leave it here for now.
3882 for (i = 0; i < 10; i++)
3883 *batch++ = MI_NOOP;
3885 /* Pad to end of cacheline */
3886 while ((unsigned long)batch % CACHELINE_BYTES)
3887 *batch++ = MI_NOOP;
3889 return batch;
3892 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3894 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3896 struct drm_i915_gem_object *obj;
3897 struct i915_vma *vma;
3898 int err;
3900 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3901 if (IS_ERR(obj))
3902 return PTR_ERR(obj);
3904 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3905 if (IS_ERR(vma)) {
3906 err = PTR_ERR(vma);
3907 goto err;
3910 err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3911 if (err)
3912 goto err;
3914 engine->wa_ctx.vma = vma;
3915 return 0;
3917 err:
3918 i915_gem_object_put(obj);
3919 return err;
3922 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3924 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3927 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
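/*
 * intel_init_workaround_bb() builds the context workaround batch buffers
 * for the render engine: a per-gen switch picks the indirect-ctx and
 * per-ctx emitters (none are required on gen11/12), lrc_setup_wa_ctx()
 * provides the backing page, and each buffer is then emitted with its
 * cacheline-aligned offset and size recorded in engine->wa_ctx so that
 * init_wa_bb_reg_state() can later point the context image at them.
 */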
3929 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3931 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3932 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3933 &wa_ctx->per_ctx };
3934 wa_bb_func_t wa_bb_fn[2];
3935 void *batch, *batch_ptr;
3936 unsigned int i;
3937 int ret;
3939 if (engine->class != RENDER_CLASS)
3940 return 0;
3942 switch (INTEL_GEN(engine->i915)) {
3943 case 12:
3944 case 11:
3945 return 0;
3946 case 10:
3947 wa_bb_fn[0] = gen10_init_indirectctx_bb;
3948 wa_bb_fn[1] = NULL;
3949 break;
3950 case 9:
3951 wa_bb_fn[0] = gen9_init_indirectctx_bb;
3952 wa_bb_fn[1] = NULL;
3953 break;
3954 case 8:
3955 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3956 wa_bb_fn[1] = NULL;
3957 break;
3958 default:
3959 MISSING_CASE(INTEL_GEN(engine->i915));
3960 return 0;
3963 ret = lrc_setup_wa_ctx(engine);
3964 if (ret) {
3965 drm_dbg(&engine->i915->drm,
3966 "Failed to setup context WA page: %d\n", ret);
3967 return ret;
3970 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
3973 * Emit the two workaround batch buffers, recording the offset from the
3974 * start of the workaround batch buffer object for each and their
3975 * respective sizes.
3977 batch_ptr = batch;
3978 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3979 wa_bb[i]->offset = batch_ptr - batch;
3980 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3981 CACHELINE_BYTES))) {
3982 ret = -EINVAL;
3983 break;
3985 if (wa_bb_fn[i])
3986 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3987 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3989 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3991 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
3992 __i915_gem_object_release_map(wa_ctx->vma->obj);
3993 if (ret)
3994 lrc_destroy_wa_ctx(engine);
3996 return ret;
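/*
 * reset_csb_pointers() resynchronises our cached CSB state with the HW
 * after a reset: the ring-pause semaphore is cleared, the
 * RING_CONTEXT_STATUS_PTR register is forced back to its reset value via
 * mmio (and once more for paranoia), and both our cached head and the
 * CSB write pointer in the HWSP are wound back to the last entry so that
 * the first event we process afterwards is entry 0.
 */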
3999 static void reset_csb_pointers(struct intel_engine_cs *engine)
4001 struct intel_engine_execlists * const execlists = &engine->execlists;
4002 const unsigned int reset_value = execlists->csb_size - 1;
4004 ring_set_paused(engine, 0);
4007 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
4008 * Bludgeon them with an mmio update to be sure.
4010 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4011 0xffff << 16 | reset_value << 8 | reset_value);
4012 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4015 * After a reset, the HW starts writing into CSB entry [0]. We
4016 * therefore have to set our HEAD pointer back one entry so that
4017 * the *first* entry we check is entry 0. To complicate this further,
4018 * as we don't wait for the first interrupt after reset, we have to
4019 * fake the HW write to point back to the last entry so that our
4020 * inline comparison of our cached head position against the last HW
4021 * write works even before the first interrupt.
4023 execlists->csb_head = reset_value;
4024 WRITE_ONCE(*execlists->csb_write, reset_value);
4025 wmb(); /* Make sure this is visible to HW (paranoia?) */
4027 invalidate_csb_entries(&execlists->csb_status[0],
4028 &execlists->csb_status[reset_value]);
4030 /* Once more for luck and our trusty paranoia */
4031 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4032 0xffff << 16 | reset_value << 8 | reset_value);
4033 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4035 GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
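/*
 * execlists_sanitize() runs when we retake control of the engine (for
 * example on resume) and before it is used again: with
 * CONFIG_DRM_I915_DEBUG_GEM the status page is poisoned to catch stale
 * reads, the CSB pointers are reset, the kernel context's seqno in the
 * HWSP is reinitialised and the HWSP cachelines are flushed back to
 * memory.
 */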
4038 static void execlists_sanitize(struct intel_engine_cs *engine)
4041 * Poison residual state on resume, in case the suspend didn't!
4043 * We have to assume that, across suspend/resume (or other loss
4044 * of control), the contents of our pinned buffers have been
4045 * lost, replaced by garbage. Since this doesn't always happen,
4046 * let's poison such state so that we more quickly spot when
4047 * we falsely assume it has been preserved.
4049 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4050 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4052 reset_csb_pointers(engine);
4055 * The kernel_context HWSP is stored in the status_page. As above,
4056 * that may be lost on resume/initialisation, and so we need to
4057 * reset the value in the HWSP.
4059 intel_timeline_reset_seqno(engine->kernel_context->timeline);
4061 /* And scrub the dirty cachelines for the HWSP */
4062 clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4065 static void enable_error_interrupt(struct intel_engine_cs *engine)
4067 u32 status;
4069 engine->execlists.error_interrupt = 0;
4070 ENGINE_WRITE(engine, RING_EMR, ~0u);
4071 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4073 status = ENGINE_READ(engine, RING_ESR);
4074 if (unlikely(status)) {
4075 drm_err(&engine->i915->drm,
4076 "engine '%s' resumed still in error: %08x\n",
4077 engine->name, status);
4078 __intel_gt_reset(engine->gt, engine->mask);
4082 * On current gen8+, we have 2 signals to play with
4084 * - I915_ERROR_INSTRUCTION (bit 0)
4086 * Generate an error if the command parser encounters an invalid
4087 * instruction
4089 * This is a fatal error.
4091 * - CP_PRIV (bit 2)
4093 * Generate an error on privilege violation (where the CP replaces
4094 * the instruction with a no-op). This also fires for writes into
4095 * read-only scratch pages.
4097 * This is a non-fatal error; parsing continues.
4099 * * there are a few others defined for odd HW that we do not use
4101 * Since CP_PRIV fires for cases where we have chosen to ignore the
4102 * error (as the HW is validating and suppressing the mistakes), we
4103 * only unmask the instruction error bit.
4105 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
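/*
 * enable_execlists() performs the mmio programming required before
 * submission can restart: HWSTAM is set via
 * intel_engine_set_hwsp_writemask(), RING_MODE selects execlists
 * (GEN11_GFX_DISABLE_LEGACY_MODE on gen11+, GFX_RUN_LIST_ENABLE before
 * that), STOP_RING is cleared, RING_HWS_PGA is pointed at the status
 * page, the error interrupts are enabled and the bitmap of context tags
 * used for CCID assignment is refilled.
 */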
4108 static void enable_execlists(struct intel_engine_cs *engine)
4110 u32 mode;
4112 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4114 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4116 if (INTEL_GEN(engine->i915) >= 11)
4117 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4118 else
4119 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4120 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4122 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4124 ENGINE_WRITE_FW(engine,
4125 RING_HWS_PGA,
4126 i915_ggtt_offset(engine->status_page.vma));
4127 ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4129 enable_error_interrupt(engine);
4131 engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4134 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4136 bool unexpected = false;
4138 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4139 drm_dbg(&engine->i915->drm,
4140 "STOP_RING still set in RING_MI_MODE\n");
4141 unexpected = true;
4144 return unexpected;
4147 static int execlists_resume(struct intel_engine_cs *engine)
4149 intel_mocs_init_engine(engine);
4151 intel_engine_reset_breadcrumbs(engine);
4153 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4154 struct drm_printer p = drm_debug_printer(__func__);
4156 intel_engine_dump(engine, &p, NULL);
4159 enable_execlists(engine);
4161 return 0;
4164 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4166 struct intel_engine_execlists * const execlists = &engine->execlists;
4167 unsigned long flags;
4169 ENGINE_TRACE(engine, "depth<-%d\n",
4170 atomic_read(&execlists->tasklet.count));
4173 * Prevent request submission to the hardware until we have
4174 * completed the reset in i915_gem_reset_finish(). If a request
4175 * is completed by one engine, it may then queue a request
4176 * to a second via its execlists->tasklet *just* as we are
4177 * calling engine->resume() and also writing the ELSP.
4178 * Turning off the execlists->tasklet until the reset is over
4179 * prevents the race.
4181 __tasklet_disable_sync_once(&execlists->tasklet);
4182 GEM_BUG_ON(!reset_in_progress(execlists));
4184 /* And flush any current direct submission. */
4185 spin_lock_irqsave(&engine->active.lock, flags);
4186 spin_unlock_irqrestore(&engine->active.lock, flags);
4189 * We stop the engines; otherwise we might get a failed reset and
4190 * a dead gpu (on elk). Even a gpu as modern as kbl can suffer
4191 * from a system hang if a batchbuffer is progressing when
4192 * the reset is issued, regardless of the READY_TO_RESET ack.
4193 * Thus assume it is best to stop the engines on all gens
4194 * where we have a gpu reset.
4196 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4198 * FIXME: Wa for more modern gens needs to be validated
4200 ring_set_paused(engine, 1);
4201 intel_engine_stop_cs(engine);
4203 engine->execlists.reset_ccid = active_ccid(engine);
4206 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4208 int x;
4210 x = lrc_ring_mi_mode(engine);
4211 if (x != -1) {
4212 regs[x + 1] &= ~STOP_RING;
4213 regs[x + 1] |= STOP_RING << 16;
4217 static void __execlists_reset_reg_state(const struct intel_context *ce,
4218 const struct intel_engine_cs *engine)
4220 u32 *regs = ce->lrc_reg_state;
4222 __reset_stop_ring(regs, engine);
4225 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4227 struct intel_engine_execlists * const execlists = &engine->execlists;
4228 struct intel_context *ce;
4229 struct i915_request *rq;
4230 u32 head;
4232 mb(); /* paranoia: read the CSB pointers from after the reset */
4233 clflush(execlists->csb_write);
4234 mb();
4236 process_csb(engine); /* drain preemption events */
4238 /* Following the reset, we need to reload the CSB read/write pointers */
4239 reset_csb_pointers(engine);
4242 * Save the currently executing context; even if we completed
4243 * its request, it was still running at the time of the
4244 * reset and will have been clobbered.
4246 rq = active_context(engine, engine->execlists.reset_ccid);
4247 if (!rq)
4248 goto unwind;
4250 ce = rq->context;
4251 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4253 if (i915_request_completed(rq)) {
4254 /* Idle context; tidy up the ring so we can restart afresh */
4255 head = intel_ring_wrap(ce->ring, rq->tail);
4256 goto out_replay;
4259 /* We still have requests in-flight; the engine should be active */
4260 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4262 /* Context has requests still in-flight; it should not be idle! */
4263 GEM_BUG_ON(i915_active_is_idle(&ce->active));
4265 rq = active_request(ce->timeline, rq);
4266 head = intel_ring_wrap(ce->ring, rq->head);
4267 GEM_BUG_ON(head == ce->ring->tail);
4270 * If this request hasn't started yet, e.g. it is waiting on a
4271 * semaphore, we need to avoid skipping the request or else we
4272 * break the signaling chain. However, if the context is corrupt
4273 * the request will not restart and we will be stuck with a wedged
4274 * device. It is quite often the case that if we issue a reset
4275 * while the GPU is loading the context image, the context
4276 * image becomes corrupt.
4278 * Otherwise, if we have not started yet, the request should replay
4279 * perfectly and we do not need to flag the result as being erroneous.
4281 if (!i915_request_started(rq))
4282 goto out_replay;
4285 * If the request was innocent, we leave the request in the ELSP
4286 * and will try to replay it on restarting. The context image may
4287 * have been corrupted by the reset, in which case we may have
4288 * to service a new GPU hang, but more likely we can continue on
4289 * without impact.
4291 * If the request was guilty, we presume the context is corrupt
4292 * and have to at least restore the RING register in the context
4293 * image back to the expected values to skip over the guilty request.
4295 __i915_request_reset(rq, stalled);
4298 * We want a simple context + ring to execute the breadcrumb update.
4299 * We cannot rely on the context being intact across the GPU hang,
4300 * so clear it and rebuild just what we need for the breadcrumb.
4301 * All pending requests for this context will be zapped, and any
4302 * future request will be after userspace has had the opportunity
4303 * to recreate its own state.
4305 out_replay:
4306 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4307 head, ce->ring->tail);
4308 __execlists_reset_reg_state(ce, engine);
4309 __execlists_update_reg_state(ce, engine, head);
4310 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4312 unwind:
4313 /* Push back any incomplete requests for replay after the reset. */
4314 cancel_port_requests(execlists);
4315 __unwind_incomplete_requests(engine);
4318 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4320 unsigned long flags;
4322 ENGINE_TRACE(engine, "\n");
4324 spin_lock_irqsave(&engine->active.lock, flags);
4326 __execlists_reset(engine, stalled);
4328 spin_unlock_irqrestore(&engine->active.lock, flags);
4331 static void nop_submission_tasklet(unsigned long data)
4333 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4335 /* The driver is wedged; don't process any more events. */
4336 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4339 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4341 struct intel_engine_execlists * const execlists = &engine->execlists;
4342 struct i915_request *rq, *rn;
4343 struct rb_node *rb;
4344 unsigned long flags;
4346 ENGINE_TRACE(engine, "\n");
4349 * Before we call engine->cancel_requests(), we should have exclusive
4350 * access to the submission state. This is arranged for us by the
4351 * caller disabling the interrupt generation, the tasklet and other
4352 * threads that may then access the same state, giving us a free hand
4353 * to reset state. However, we still need to let lockdep be aware that
4354 * we know this state may be accessed in hardirq context, so we
4355 * disable the irq around this manipulation and we want to keep
4356 * the spinlock focused on its duties and not accidentally conflate
4357 * coverage to the submission's irq state. (Similarly, although we
4358 * shouldn't need to disable irq around the manipulation of the
4359 * submission's irq state, we also wish to remind ourselves that
4360 * it is irq state.)
4362 spin_lock_irqsave(&engine->active.lock, flags);
4364 __execlists_reset(engine, true);
4366 /* Mark all executing requests as skipped. */
4367 list_for_each_entry(rq, &engine->active.requests, sched.link)
4368 mark_eio(rq);
4370 /* Flush the queued requests to the timeline list (for retiring). */
4371 while ((rb = rb_first_cached(&execlists->queue))) {
4372 struct i915_priolist *p = to_priolist(rb);
4373 int i;
4375 priolist_for_each_request_consume(rq, rn, p, i) {
4376 mark_eio(rq);
4377 __i915_request_submit(rq);
4380 rb_erase_cached(&p->node, &execlists->queue);
4381 i915_priolist_free(p);
4384 /* On-hold requests will be flushed to timeline upon their release */
4385 list_for_each_entry(rq, &engine->active.hold, sched.link)
4386 mark_eio(rq);
4388 /* Cancel all attached virtual engines */
4389 while ((rb = rb_first_cached(&execlists->virtual))) {
4390 struct virtual_engine *ve =
4391 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4393 rb_erase_cached(rb, &execlists->virtual);
4394 RB_CLEAR_NODE(rb);
4396 spin_lock(&ve->base.active.lock);
4397 rq = fetch_and_zero(&ve->request);
4398 if (rq) {
4399 mark_eio(rq);
4401 rq->engine = engine;
4402 __i915_request_submit(rq);
4403 i915_request_put(rq);
4405 ve->base.execlists.queue_priority_hint = INT_MIN;
4407 spin_unlock(&ve->base.active.lock);
4410 /* Remaining _unready_ requests will be nop'ed when submitted */
4412 execlists->queue_priority_hint = INT_MIN;
4413 execlists->queue = RB_ROOT_CACHED;
4415 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4416 execlists->tasklet.func = nop_submission_tasklet;
4418 spin_unlock_irqrestore(&engine->active.lock, flags);
4421 static void execlists_reset_finish(struct intel_engine_cs *engine)
4423 struct intel_engine_execlists * const execlists = &engine->execlists;
4426 * After a GPU reset, we may have requests to replay. Do so now while
4427 * we still have the forcewake to be sure that the GPU is not allowed
4428 * to sleep before we restart and reload a context.
4430 GEM_BUG_ON(!reset_in_progress(execlists));
4431 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4432 execlists->tasklet.func(execlists->tasklet.data);
4434 if (__tasklet_enable(&execlists->tasklet))
4435 /* And kick in case we missed a new request submission. */
4436 tasklet_hi_schedule(&execlists->tasklet);
4437 ENGINE_TRACE(engine, "depth->%d\n",
4438 atomic_read(&execlists->tasklet.count));
4441 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4442 u64 offset, u32 len,
4443 const unsigned int flags)
4445 u32 *cs;
4447 cs = intel_ring_begin(rq, 4);
4448 if (IS_ERR(cs))
4449 return PTR_ERR(cs);
4452 * WaDisableCtxRestoreArbitration:bdw,chv
4454 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4455 * particular on all the gens that do not need the w/a at all!); if we
4456 * took care to make sure that on every switch into this context
4457 * (both ordinary and for preemption) arbitration was enabled,
4458 * we would be fine. However, for gen8 there is another w/a that
4459 * requires us to not preempt inside GPGPU execution, so we keep
4460 * arbitration disabled for gen8 batches. Arbitration will be
4461 * re-enabled before we close the request
4462 * (engine->emit_fini_breadcrumb).
4464 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4466 /* FIXME(BDW+): Address space and security selectors. */
4467 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4468 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4469 *cs++ = lower_32_bits(offset);
4470 *cs++ = upper_32_bits(offset);
4472 intel_ring_advance(rq, cs);
4474 return 0;
4477 static int gen8_emit_bb_start(struct i915_request *rq,
4478 u64 offset, u32 len,
4479 const unsigned int flags)
4481 u32 *cs;
4483 cs = intel_ring_begin(rq, 6);
4484 if (IS_ERR(cs))
4485 return PTR_ERR(cs);
4487 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4489 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4490 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4491 *cs++ = lower_32_bits(offset);
4492 *cs++ = upper_32_bits(offset);
4494 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4495 *cs++ = MI_NOOP;
4497 intel_ring_advance(rq, cs);
4499 return 0;
4502 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4504 ENGINE_WRITE(engine, RING_IMR,
4505 ~(engine->irq_enable_mask | engine->irq_keep_mask));
4506 ENGINE_POSTING_READ(engine, RING_IMR);
4509 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4511 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4514 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4516 u32 cmd, *cs;
4518 cs = intel_ring_begin(request, 4);
4519 if (IS_ERR(cs))
4520 return PTR_ERR(cs);
4522 cmd = MI_FLUSH_DW + 1;
4524 /* We always require a command barrier so that subsequent
4525 * commands, such as breadcrumb interrupts, are strictly ordered
4526 * wrt the contents of the write cache being flushed to memory
4527 * (and thus being coherent from the CPU).
4529 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4531 if (mode & EMIT_INVALIDATE) {
4532 cmd |= MI_INVALIDATE_TLB;
4533 if (request->engine->class == VIDEO_DECODE_CLASS)
4534 cmd |= MI_INVALIDATE_BSD;
4537 *cs++ = cmd;
4538 *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4539 *cs++ = 0; /* upper addr */
4540 *cs++ = 0; /* value */
4541 intel_ring_advance(request, cs);
4543 return 0;
4546 static int gen8_emit_flush_render(struct i915_request *request,
4547 u32 mode)
4549 bool vf_flush_wa = false, dc_flush_wa = false;
4550 u32 *cs, flags = 0;
4551 int len;
4553 flags |= PIPE_CONTROL_CS_STALL;
4555 if (mode & EMIT_FLUSH) {
4556 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4557 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4558 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4559 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4562 if (mode & EMIT_INVALIDATE) {
4563 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4564 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4565 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4566 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4567 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4568 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4569 flags |= PIPE_CONTROL_QW_WRITE;
4570 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4573 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4574 * pipe control.
4576 if (IS_GEN(request->engine->i915, 9))
4577 vf_flush_wa = true;
4579 /* WaForGAMHang:kbl */
4580 if (IS_KBL_REVID(request->engine->i915, 0, KBL_REVID_B0))
4581 dc_flush_wa = true;
4584 len = 6;
4586 if (vf_flush_wa)
4587 len += 6;
4589 if (dc_flush_wa)
4590 len += 12;
4592 cs = intel_ring_begin(request, len);
4593 if (IS_ERR(cs))
4594 return PTR_ERR(cs);
4596 if (vf_flush_wa)
4597 cs = gen8_emit_pipe_control(cs, 0, 0);
4599 if (dc_flush_wa)
4600 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4603 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4605 if (dc_flush_wa)
4606 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4608 intel_ring_advance(request, cs);
4610 return 0;
4613 static int gen11_emit_flush_render(struct i915_request *request,
4614 u32 mode)
4616 if (mode & EMIT_FLUSH) {
4617 u32 *cs;
4618 u32 flags = 0;
4620 flags |= PIPE_CONTROL_CS_STALL;
4622 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4623 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4624 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4625 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4626 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4627 flags |= PIPE_CONTROL_QW_WRITE;
4628 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4630 cs = intel_ring_begin(request, 6);
4631 if (IS_ERR(cs))
4632 return PTR_ERR(cs);
4634 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4635 intel_ring_advance(request, cs);
4638 if (mode & EMIT_INVALIDATE) {
4639 u32 *cs;
4640 u32 flags = 0;
4642 flags |= PIPE_CONTROL_CS_STALL;
4644 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4645 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4646 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4647 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4648 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4649 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4650 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4651 flags |= PIPE_CONTROL_QW_WRITE;
4652 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4654 cs = intel_ring_begin(request, 6);
4655 if (IS_ERR(cs))
4656 return PTR_ERR(cs);
4658 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4659 intel_ring_advance(request, cs);
4662 return 0;
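/*
 * preparser_disable() builds the gen12 MI_ARB_CHECK variant used to
 * toggle the command streamer's pre-parser; bit 8 appears to act as the
 * write mask for the disable flag carried in bit 0, so
 * preparser_disable(true) requests that pre-fetching stop and
 * preparser_disable(false) re-enables it. gen12_emit_flush_render()
 * below brackets its TLB invalidation with this pair so the pre-parser
 * cannot load stale pages for the batch.
 */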
4665 static u32 preparser_disable(bool state)
4667 return MI_ARB_CHECK | 1 << 8 | state;
4670 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4672 static const i915_reg_t vd[] = {
4673 GEN12_VD0_AUX_NV,
4674 GEN12_VD1_AUX_NV,
4675 GEN12_VD2_AUX_NV,
4676 GEN12_VD3_AUX_NV,
4679 static const i915_reg_t ve[] = {
4680 GEN12_VE0_AUX_NV,
4681 GEN12_VE1_AUX_NV,
4684 if (engine->class == VIDEO_DECODE_CLASS)
4685 return vd[engine->instance];
4687 if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4688 return ve[engine->instance];
4690 GEM_BUG_ON("unknown aux_inv_reg\n");
4692 return INVALID_MMIO_REG;
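/*
 * gen12_emit_aux_table_inv() emits a single MI_LOAD_REGISTER_IMM that
 * writes AUX_INV into the engine's *_AUX_NV register (see aux_inv_reg()
 * above), triggering an AUX table invalidation (hsdes: 1809175790); the
 * trailing MI_NOOP keeps the emitted dword count even.
 */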
4695 static u32 *
4696 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4698 *cs++ = MI_LOAD_REGISTER_IMM(1);
4699 *cs++ = i915_mmio_reg_offset(inv_reg);
4700 *cs++ = AUX_INV;
4701 *cs++ = MI_NOOP;
4703 return cs;
4706 static int gen12_emit_flush_render(struct i915_request *request,
4707 u32 mode)
4709 if (mode & EMIT_FLUSH) {
4710 u32 flags = 0;
4711 u32 *cs;
4713 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4714 flags |= PIPE_CONTROL_FLUSH_L3;
4715 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4716 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4717 /* Wa_1409600907:tgl */
4718 flags |= PIPE_CONTROL_DEPTH_STALL;
4719 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4720 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4722 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4723 flags |= PIPE_CONTROL_QW_WRITE;
4725 flags |= PIPE_CONTROL_CS_STALL;
4727 cs = intel_ring_begin(request, 6);
4728 if (IS_ERR(cs))
4729 return PTR_ERR(cs);
4731 cs = gen12_emit_pipe_control(cs,
4732 PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4733 flags, LRC_PPHWSP_SCRATCH_ADDR);
4734 intel_ring_advance(request, cs);
4737 if (mode & EMIT_INVALIDATE) {
4738 u32 flags = 0;
4739 u32 *cs;
4741 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4742 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4743 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4744 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4745 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4746 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4747 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4749 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4750 flags |= PIPE_CONTROL_QW_WRITE;
4752 flags |= PIPE_CONTROL_CS_STALL;
4754 cs = intel_ring_begin(request, 8 + 4);
4755 if (IS_ERR(cs))
4756 return PTR_ERR(cs);
4759 * Prevent the pre-parser from skipping past the TLB
4760 * invalidate and loading a stale page for the batch
4761 * buffer / request payload.
4763 *cs++ = preparser_disable(true);
4765 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4767 /* hsdes: 1809175790 */
4768 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4770 *cs++ = preparser_disable(false);
4771 intel_ring_advance(request, cs);
4774 return 0;
4777 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4779 intel_engine_mask_t aux_inv = 0;
4780 u32 cmd, *cs;
4782 if (mode & EMIT_INVALIDATE)
4783 aux_inv = request->engine->mask & ~BIT(BCS0);
4785 cs = intel_ring_begin(request,
4786 4 + (aux_inv ? 2 * hweight8(aux_inv) + 2 : 0));
4787 if (IS_ERR(cs))
4788 return PTR_ERR(cs);
4790 cmd = MI_FLUSH_DW + 1;
4792 /* We always require a command barrier so that subsequent
4793 * commands, such as breadcrumb interrupts, are strictly ordered
4794 * wrt the contents of the write cache being flushed to memory
4795 * (and thus being coherent from the CPU).
4797 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4799 if (mode & EMIT_INVALIDATE) {
4800 cmd |= MI_INVALIDATE_TLB;
4801 if (request->engine->class == VIDEO_DECODE_CLASS)
4802 cmd |= MI_INVALIDATE_BSD;
4805 *cs++ = cmd;
4806 *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4807 *cs++ = 0; /* upper addr */
4808 *cs++ = 0; /* value */
4810 if (aux_inv) { /* hsdes: 1809175790 */
4811 struct intel_engine_cs *engine;
4812 unsigned int tmp;
4814 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4815 for_each_engine_masked(engine, request->engine->gt,
4816 aux_inv, tmp) {
4817 *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4818 *cs++ = AUX_INV;
4820 *cs++ = MI_NOOP;
4822 intel_ring_advance(request, cs);
4824 return 0;
4827 static void assert_request_valid(struct i915_request *rq)
4829 struct intel_ring *ring __maybe_unused = rq->ring;
4831 /* Can we unwind this request without appearing to go forwards? */
4832 GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4836 * Reserve space for 2 NOOPs at the end of each request to be
4837 * used as a workaround for not being allowed to do lite
4838 * restore with HEAD==TAIL (WaIdleLiteRestore).
4840 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4842 /* Ensure there's always at least one preemption point per-request. */
4843 *cs++ = MI_ARB_CHECK;
4844 *cs++ = MI_NOOP;
4845 request->wa_tail = intel_ring_offset(request, cs);
4847 /* Check that entire request is less than half the ring */
4848 assert_request_valid(request);
4850 return cs;
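/*
 * emit_preempt_busywait() appends an MI_SEMAPHORE_WAIT that polls the
 * engine's preempt slot in the HWSP (intel_hws_preempt_address()) until
 * it reads back as zero, stalling the tail of the request while a
 * preemption is being processed (see ring_set_paused()).
 */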
4853 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4855 *cs++ = MI_SEMAPHORE_WAIT |
4856 MI_SEMAPHORE_GLOBAL_GTT |
4857 MI_SEMAPHORE_POLL |
4858 MI_SEMAPHORE_SAD_EQ_SDD;
4859 *cs++ = 0;
4860 *cs++ = intel_hws_preempt_address(request->engine);
4861 *cs++ = 0;
4863 return cs;
4866 static __always_inline u32*
4867 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4869 *cs++ = MI_USER_INTERRUPT;
4871 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4872 if (intel_engine_has_semaphores(request->engine))
4873 cs = emit_preempt_busywait(request, cs);
4875 request->tail = intel_ring_offset(request, cs);
4876 assert_ring_tail_valid(request->ring, request->tail);
4878 return gen8_emit_wa_tail(request, cs);
4881 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
4883 return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
4886 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4888 return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4891 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4893 cs = gen8_emit_pipe_control(cs,
4894 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4895 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4896 PIPE_CONTROL_DC_FLUSH_ENABLE,
4899 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4900 cs = gen8_emit_ggtt_write_rcs(cs,
4901 request->fence.seqno,
4902 hwsp_offset(request),
4903 PIPE_CONTROL_FLUSH_ENABLE |
4904 PIPE_CONTROL_CS_STALL);
4906 return gen8_emit_fini_breadcrumb_tail(request, cs);
4909 static u32 *
4910 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4912 cs = gen8_emit_ggtt_write_rcs(cs,
4913 request->fence.seqno,
4914 hwsp_offset(request),
4915 PIPE_CONTROL_CS_STALL |
4916 PIPE_CONTROL_TILE_CACHE_FLUSH |
4917 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4918 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4919 PIPE_CONTROL_DC_FLUSH_ENABLE |
4920 PIPE_CONTROL_FLUSH_ENABLE);
4922 return gen8_emit_fini_breadcrumb_tail(request, cs);
4926 * Note that the CS instruction pre-parser will not stall on the breadcrumb
4927 * flush and will continue pre-fetching the instructions after it before the
4928 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4929 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4930 * of the next request before the memory has been flushed, we're guaranteed that
4931 * we won't access the batch itself too early.
4932 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4933 * so, if the current request is modifying an instruction in the next request on
4934 * the same intel_context, we might pre-fetch and then execute the pre-update
4935 * instruction. To avoid this, the users of self-modifying code should either
4936 * disable the parser around the code emitting the memory writes, via a new flag
4937 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4938 * the in-kernel use-cases we've opted to use a separate context, see
4939 * reloc_gpu() as an example.
4940 * All the above applies only to the instructions themselves. Non-inline data
4941 * used by the instructions is not pre-fetched.
4944 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4946 *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4947 MI_SEMAPHORE_GLOBAL_GTT |
4948 MI_SEMAPHORE_POLL |
4949 MI_SEMAPHORE_SAD_EQ_SDD;
4950 *cs++ = 0;
4951 *cs++ = intel_hws_preempt_address(request->engine);
4952 *cs++ = 0;
4953 *cs++ = 0;
4954 *cs++ = MI_NOOP;
4956 return cs;
4959 static __always_inline u32*
4960 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4962 *cs++ = MI_USER_INTERRUPT;
4964 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4965 if (intel_engine_has_semaphores(request->engine))
4966 cs = gen12_emit_preempt_busywait(request, cs);
4968 request->tail = intel_ring_offset(request, cs);
4969 assert_ring_tail_valid(request->ring, request->tail);
4971 return gen8_emit_wa_tail(request, cs);
4974 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4976 return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4979 static u32 *
4980 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4982 cs = gen12_emit_ggtt_write_rcs(cs,
4983 request->fence.seqno,
4984 hwsp_offset(request),
4985 PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4986 PIPE_CONTROL_CS_STALL |
4987 PIPE_CONTROL_TILE_CACHE_FLUSH |
4988 PIPE_CONTROL_FLUSH_L3 |
4989 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4990 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4991 /* Wa_1409600907:tgl */
4992 PIPE_CONTROL_DEPTH_STALL |
4993 PIPE_CONTROL_DC_FLUSH_ENABLE |
4994 PIPE_CONTROL_FLUSH_ENABLE);
4996 return gen12_emit_fini_breadcrumb_tail(request, cs);
4999 static void execlists_park(struct intel_engine_cs *engine)
5001 cancel_timer(&engine->execlists.timer);
5002 cancel_timer(&engine->execlists.preempt);
5005 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5007 engine->submit_request = execlists_submit_request;
5008 engine->schedule = i915_schedule;
5009 engine->execlists.tasklet.func = execlists_submission_tasklet;
5011 engine->reset.prepare = execlists_reset_prepare;
5012 engine->reset.rewind = execlists_reset_rewind;
5013 engine->reset.cancel = execlists_reset_cancel;
5014 engine->reset.finish = execlists_reset_finish;
5016 engine->park = execlists_park;
5017 engine->unpark = NULL;
5019 engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5020 if (!intel_vgpu_active(engine->i915)) {
5021 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5022 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5023 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5024 if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5025 engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5029 if (INTEL_GEN(engine->i915) >= 12)
5030 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5032 if (intel_engine_has_preemption(engine))
5033 engine->emit_bb_start = gen8_emit_bb_start;
5034 else
5035 engine->emit_bb_start = gen8_emit_bb_start_noarb;
5038 static void execlists_shutdown(struct intel_engine_cs *engine)
5040 /* Synchronise with residual timers and any softirq they raise */
5041 del_timer_sync(&engine->execlists.timer);
5042 del_timer_sync(&engine->execlists.preempt);
5043 tasklet_kill(&engine->execlists.tasklet);
5046 static void execlists_release(struct intel_engine_cs *engine)
5048 engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5050 execlists_shutdown(engine);
5052 intel_engine_cleanup_common(engine);
5053 lrc_destroy_wa_ctx(engine);
5056 static void
5057 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5059 /* Default vfuncs which can be overridden by each engine. */
5061 engine->resume = execlists_resume;
5063 engine->cops = &execlists_context_ops;
5064 engine->request_alloc = execlists_request_alloc;
5066 engine->emit_flush = gen8_emit_flush;
5067 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5068 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5069 if (INTEL_GEN(engine->i915) >= 12) {
5070 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5071 engine->emit_flush = gen12_emit_flush;
5073 engine->set_default_submission = intel_execlists_set_default_submission;
5075 if (INTEL_GEN(engine->i915) < 11) {
5076 engine->irq_enable = gen8_logical_ring_enable_irq;
5077 engine->irq_disable = gen8_logical_ring_disable_irq;
5078 } else {
5080 * TODO: On Gen11 interrupt masks need to be clear
5081 * to allow C6 entry. Keep interrupts enabled
5082 * and take the hit of generating extra interrupts
5083 * until a more refined solution exists.
5088 static inline void
5089 logical_ring_default_irqs(struct intel_engine_cs *engine)
5091 unsigned int shift = 0;
5093 if (INTEL_GEN(engine->i915) < 11) {
5094 const u8 irq_shifts[] = {
5095 [RCS0] = GEN8_RCS_IRQ_SHIFT,
5096 [BCS0] = GEN8_BCS_IRQ_SHIFT,
5097 [VCS0] = GEN8_VCS0_IRQ_SHIFT,
5098 [VCS1] = GEN8_VCS1_IRQ_SHIFT,
5099 [VECS0] = GEN8_VECS_IRQ_SHIFT,
5102 shift = irq_shifts[engine->id];
5105 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5106 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5107 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5108 engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5111 static void rcs_submission_override(struct intel_engine_cs *engine)
5113 switch (INTEL_GEN(engine->i915)) {
5114 case 12:
5115 engine->emit_flush = gen12_emit_flush_render;
5116 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5117 break;
5118 case 11:
5119 engine->emit_flush = gen11_emit_flush_render;
5120 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5121 break;
5122 default:
5123 engine->emit_flush = gen8_emit_flush_render;
5124 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5125 break;
5129 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5131 struct intel_engine_execlists * const execlists = &engine->execlists;
5132 struct drm_i915_private *i915 = engine->i915;
5133 struct intel_uncore *uncore = engine->uncore;
5134 u32 base = engine->mmio_base;
5136 tasklet_init(&engine->execlists.tasklet,
5137 execlists_submission_tasklet, (unsigned long)engine);
5138 timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5139 timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5141 logical_ring_default_vfuncs(engine);
5142 logical_ring_default_irqs(engine);
5144 if (engine->class == RENDER_CLASS)
5145 rcs_submission_override(engine);
5147 if (intel_init_workaround_bb(engine))
5149 * We continue even if we fail to initialize the WA batch,
5150 * because we only expect rare glitches and nothing critical
5151 * enough to prevent us from using the GPU.
5153 drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5155 if (HAS_LOGICAL_RING_ELSQ(i915)) {
5156 execlists->submit_reg = uncore->regs +
5157 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5158 execlists->ctrl_reg = uncore->regs +
5159 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5160 } else {
5161 execlists->submit_reg = uncore->regs +
5162 i915_mmio_reg_offset(RING_ELSP(base));
5165 execlists->csb_status =
5166 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5168 execlists->csb_write =
5169 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
5171 if (INTEL_GEN(i915) < 11)
5172 execlists->csb_size = GEN8_CSB_ENTRIES;
5173 else
5174 execlists->csb_size = GEN11_CSB_ENTRIES;
5176 if (INTEL_GEN(engine->i915) >= 11) {
5177 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5178 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5181 /* Finally, take ownership and responsibility for cleanup! */
5182 engine->sanitize = execlists_sanitize;
5183 engine->release = execlists_release;
5185 return 0;
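/*
 * init_common_reg_state() fills in the register values shared by every
 * logical ring context image: CTX_CONTEXT_CONTROL (inhibiting the sync
 * context switch and, for an image populated without a default state,
 * inhibiting the context restore), the ring size/valid bits in
 * CTX_RING_CTL and a zeroed CTX_TIMESTAMP.
 */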
5188 static void init_common_reg_state(u32 * const regs,
5189 const struct intel_engine_cs *engine,
5190 const struct intel_ring *ring,
5191 bool inhibit)
5193 u32 ctl;
5195 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5196 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5197 if (inhibit)
5198 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5199 if (INTEL_GEN(engine->i915) < 11)
5200 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5201 CTX_CTRL_RS_CTX_ENABLE);
5202 regs[CTX_CONTEXT_CONTROL] = ctl;
5204 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5205 regs[CTX_TIMESTAMP] = 0;
5208 static void init_wa_bb_reg_state(u32 * const regs,
5209 const struct intel_engine_cs *engine)
5211 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5213 if (wa_ctx->per_ctx.size) {
5214 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5216 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5217 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5218 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5221 if (wa_ctx->indirect_ctx.size) {
5222 lrc_ring_setup_indirect_ctx(regs, engine,
5223 i915_ggtt_offset(wa_ctx->vma) +
5224 wa_ctx->indirect_ctx.offset,
5225 wa_ctx->indirect_ctx.size);
5229 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5231 if (i915_vm_is_4lvl(&ppgtt->vm)) {
5232 /* 64b PPGTT (48bit canonical)
5233 * PDP0_DESCRIPTOR contains the base address of the PML4,
5234 * and the other PDP Descriptors are ignored.
5236 ASSIGN_CTX_PML4(ppgtt, regs);
5237 } else {
5238 ASSIGN_CTX_PDP(ppgtt, regs, 3);
5239 ASSIGN_CTX_PDP(ppgtt, regs, 2);
5240 ASSIGN_CTX_PDP(ppgtt, regs, 1);
5241 ASSIGN_CTX_PDP(ppgtt, regs, 0);
5245 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5247 if (i915_is_ggtt(vm))
5248 return i915_vm_to_ggtt(vm)->alias;
5249 else
5250 return i915_vm_to_ppgtt(vm);
5253 static void execlists_init_reg_state(u32 *regs,
5254 const struct intel_context *ce,
5255 const struct intel_engine_cs *engine,
5256 const struct intel_ring *ring,
5257 bool inhibit)
5260 * A context is actually a big batch buffer with several
5261 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5262 * values we are setting here are only for the first context restore:
5263 * on a subsequent save, the GPU will recreate this batchbuffer with new
5264 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5265 * we are not initializing here).
5267 * Must keep consistent with virtual_update_register_offsets().
5269 set_offsets(regs, reg_offsets(engine), engine, inhibit);
5271 init_common_reg_state(regs, engine, ring, inhibit);
5272 init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5274 init_wa_bb_reg_state(regs, engine);
5276 __reset_stop_ring(regs, engine);
5279 static int
5280 populate_lr_context(struct intel_context *ce,
5281 struct drm_i915_gem_object *ctx_obj,
5282 struct intel_engine_cs *engine,
5283 struct intel_ring *ring)
5285 bool inhibit = true;
5286 void *vaddr;
5288 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5289 if (IS_ERR(vaddr)) {
5290 drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5291 return PTR_ERR(vaddr);
5294 set_redzone(vaddr, engine);
5296 if (engine->default_state) {
5297 shmem_read(engine->default_state, 0,
5298 vaddr, engine->context_size);
5299 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
5300 inhibit = false;
5303 /* Clear the ppHWSP (inc. per-context counters) */
5304 memset(vaddr, 0, PAGE_SIZE);
5307 * The second page of the context object contains some registers which
5308 * must be set up prior to the first execution.
5310 execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5311 ce, engine, ring, inhibit);
5313 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5314 i915_gem_object_unpin_map(ctx_obj);
5315 return 0;
5318 static int __execlists_context_alloc(struct intel_context *ce,
5319 struct intel_engine_cs *engine)
5321 struct drm_i915_gem_object *ctx_obj;
5322 struct intel_ring *ring;
5323 struct i915_vma *vma;
5324 u32 context_size;
5325 int ret;
5327 GEM_BUG_ON(ce->state);
5328 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5330 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5331 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5333 if (INTEL_GEN(engine->i915) == 12) {
5334 ce->wa_bb_page = context_size / PAGE_SIZE;
5335 context_size += PAGE_SIZE;
5338 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5339 if (IS_ERR(ctx_obj))
5340 return PTR_ERR(ctx_obj);
5342 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5343 if (IS_ERR(vma)) {
5344 ret = PTR_ERR(vma);
5345 goto error_deref_obj;
5348 if (!ce->timeline) {
5349 struct intel_timeline *tl;
5350 struct i915_vma *hwsp;
5353 * Use the static global HWSP for the kernel context, and
5354 * a dynamically allocated cacheline for everyone else.
5356 hwsp = NULL;
5357 if (unlikely(intel_context_is_barrier(ce)))
5358 hwsp = engine->status_page.vma;
5360 tl = intel_timeline_create(engine->gt, hwsp);
5361 if (IS_ERR(tl)) {
5362 ret = PTR_ERR(tl);
5363 goto error_deref_obj;
5366 ce->timeline = tl;
5369 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5370 if (IS_ERR(ring)) {
5371 ret = PTR_ERR(ring);
5372 goto error_deref_obj;
5375 ret = populate_lr_context(ce, ctx_obj, engine, ring);
5376 if (ret) {
5377 drm_dbg(&engine->i915->drm,
5378 "Failed to populate LRC: %d\n", ret);
5379 goto error_ring_free;
5382 ce->ring = ring;
5383 ce->state = vma;
5385 return 0;
5387 error_ring_free:
5388 intel_ring_put(ring);
5389 error_deref_obj:
5390 i915_gem_object_put(ctx_obj);
5391 return ret;
5394 static struct list_head *virtual_queue(struct virtual_engine *ve)
5396 return &ve->base.execlists.default_priolist.requests[0];
5399 static void virtual_context_destroy(struct kref *kref)
5401 struct virtual_engine *ve =
5402 container_of(kref, typeof(*ve), context.ref);
5403 unsigned int n;
5405 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5406 GEM_BUG_ON(ve->request);
5407 GEM_BUG_ON(ve->context.inflight);
5409 for (n = 0; n < ve->num_siblings; n++) {
5410 struct intel_engine_cs *sibling = ve->siblings[n];
5411 struct rb_node *node = &ve->nodes[sibling->id].rb;
5412 unsigned long flags;
5414 if (RB_EMPTY_NODE(node))
5415 continue;
5417 spin_lock_irqsave(&sibling->active.lock, flags);
5419 /* Detachment is lazily performed in the execlists tasklet */
5420 if (!RB_EMPTY_NODE(node))
5421 rb_erase_cached(node, &sibling->execlists.virtual);
5423 spin_unlock_irqrestore(&sibling->active.lock, flags);
5425 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5427 if (ve->context.state)
5428 __execlists_context_fini(&ve->context);
5429 intel_context_fini(&ve->context);
5431 intel_engine_free_request_pool(&ve->base);
5433 kfree(ve->bonds);
5434 kfree(ve);
5437 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5439 int swp;
5442 * Pick a random sibling on starting to help spread the load around.
5444 * New contexts are typically created with exactly the same order
5445 * of siblings, and often started in batches. Due to the way we iterate
5446 * the array of siblings when submitting requests, sibling[0] is
5447 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5448 * randomised across the system, we also help spread the load by the
5449 * first engine we inspect being different each time.
5451 * NB This does not force us to execute on this engine, it will just
5452 * typically be the first we inspect for submission.
5454 swp = prandom_u32_max(ve->num_siblings);
5455 if (swp)
5456 swap(ve->siblings[swp], ve->siblings[0]);
5459 static int virtual_context_alloc(struct intel_context *ce)
5461 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5463 return __execlists_context_alloc(ce, ve->siblings[0]);
5466 static int virtual_context_pin(struct intel_context *ce)
5468 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5470 /* Note: we must use a real engine class for setting up reg state */
5471 return __execlists_context_pin(ce, ve->siblings[0]);
5474 static void virtual_context_enter(struct intel_context *ce)
5476 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5477 unsigned int n;
5479 for (n = 0; n < ve->num_siblings; n++)
5480 intel_engine_pm_get(ve->siblings[n]);
5482 intel_timeline_enter(ce->timeline);
5485 static void virtual_context_exit(struct intel_context *ce)
5487 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5488 unsigned int n;
5490 intel_timeline_exit(ce->timeline);
5492 for (n = 0; n < ve->num_siblings; n++)
5493 intel_engine_pm_put(ve->siblings[n]);
5496 static const struct intel_context_ops virtual_context_ops = {
5497 .alloc = virtual_context_alloc,
5499 .pin = virtual_context_pin,
5500 .unpin = execlists_context_unpin,
5502 .enter = virtual_context_enter,
5503 .exit = virtual_context_exit,
5505 .destroy = virtual_context_destroy,
5508 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5510 struct i915_request *rq;
5511 intel_engine_mask_t mask;
5513 rq = READ_ONCE(ve->request);
5514 if (!rq)
5515 return 0;
5517 /* The rq is ready for submission; rq->execution_mask is now stable. */
5518 mask = rq->execution_mask;
5519 if (unlikely(!mask)) {
5520 /* Invalid selection, submit to a random engine in error */
5521 i915_request_set_error_once(rq, -ENODEV);
5522 mask = ve->siblings[0]->mask;
5525 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5526 rq->fence.context, rq->fence.seqno,
5527 mask, ve->base.execlists.queue_priority_hint);
5529 return mask;
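/*
 * virtual_submission_tasklet() distributes the single pending virtual
 * request: for every sibling still allowed by the request's execution
 * mask it (re)inserts this virtual engine's ve_node into the sibling's
 * execlists.virtual rbtree, keyed on priority, and kicks the sibling's
 * tasklet if the node became the highest-priority waiter; siblings that
 * are no longer allowed have their node removed. Whichever sibling
 * dequeues the node first ends up executing the request.
 */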
5532 static void virtual_submission_tasklet(unsigned long data)
5534 struct virtual_engine * const ve = (struct virtual_engine *)data;
5535 const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5536 intel_engine_mask_t mask;
5537 unsigned int n;
5539 rcu_read_lock();
5540 mask = virtual_submission_mask(ve);
5541 rcu_read_unlock();
5542 if (unlikely(!mask))
5543 return;
5545 local_irq_disable();
5546 for (n = 0; n < ve->num_siblings; n++) {
5547 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5548 struct ve_node * const node = &ve->nodes[sibling->id];
5549 struct rb_node **parent, *rb;
5550 bool first;
5552 if (!READ_ONCE(ve->request))
5553 break; /* already handled by a sibling's tasklet */
5555 if (unlikely(!(mask & sibling->mask))) {
5556 if (!RB_EMPTY_NODE(&node->rb)) {
5557 spin_lock(&sibling->active.lock);
5558 rb_erase_cached(&node->rb,
5559 &sibling->execlists.virtual);
5560 RB_CLEAR_NODE(&node->rb);
5561 spin_unlock(&sibling->active.lock);
5563 continue;
5566 spin_lock(&sibling->active.lock);
5568 if (!RB_EMPTY_NODE(&node->rb)) {
5570 * Cheat and avoid rebalancing the tree if we can
5571 * reuse this node in situ.
5573 first = rb_first_cached(&sibling->execlists.virtual) ==
5574 &node->rb;
5575 if (prio == node->prio || (prio > node->prio && first))
5576 goto submit_engine;
5578 rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5581 rb = NULL;
5582 first = true;
5583 parent = &sibling->execlists.virtual.rb_root.rb_node;
5584 while (*parent) {
5585 struct ve_node *other;
5587 rb = *parent;
5588 other = rb_entry(rb, typeof(*other), rb);
5589 if (prio > other->prio) {
5590 parent = &rb->rb_left;
5591 } else {
5592 parent = &rb->rb_right;
5593 first = false;
5597 rb_link_node(&node->rb, rb, parent);
5598 rb_insert_color_cached(&node->rb,
5599 &sibling->execlists.virtual,
5600 first);
5602 submit_engine:
5603 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5604 node->prio = prio;
5605 if (first && prio > sibling->execlists.queue_priority_hint)
5606 tasklet_hi_schedule(&sibling->execlists.tasklet);
5608 spin_unlock(&sibling->active.lock);
5610 local_irq_enable();
5613 static void virtual_submit_request(struct i915_request *rq)
5615 struct virtual_engine *ve = to_virtual_engine(rq->engine);
5616 struct i915_request *old;
5617 unsigned long flags;
5619 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5620 rq->fence.context,
5621 rq->fence.seqno);
5623 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5625 spin_lock_irqsave(&ve->base.active.lock, flags);
5627 old = ve->request;
5628 if (old) { /* background completion event from preempt-to-busy */
5629 GEM_BUG_ON(!i915_request_completed(old));
5630 __i915_request_submit(old);
5631 i915_request_put(old);
5634 if (i915_request_completed(rq)) {
5635 __i915_request_submit(rq);
5637 ve->base.execlists.queue_priority_hint = INT_MIN;
5638 ve->request = NULL;
5639 } else {
5640 ve->base.execlists.queue_priority_hint = rq_prio(rq);
5641 ve->request = i915_request_get(rq);
5643 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5644 list_move_tail(&rq->sched.link, virtual_queue(ve));
5646 tasklet_hi_schedule(&ve->base.execlists.tasklet);
5649 spin_unlock_irqrestore(&ve->base.active.lock, flags);
5652 static struct ve_bond *
5653 virtual_find_bond(struct virtual_engine *ve,
5654 const struct intel_engine_cs *master)
5656 int i;
5658 for (i = 0; i < ve->num_bonds; i++) {
5659 if (ve->bonds[i].master == master)
5660 return &ve->bonds[i];
5663 return NULL;
5666 static void
5667 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5669 struct virtual_engine *ve = to_virtual_engine(rq->engine);
5670 intel_engine_mask_t allowed, exec;
5671 struct ve_bond *bond;
5673 allowed = ~to_request(signal)->engine->mask;
5675 bond = virtual_find_bond(ve, to_request(signal)->engine);
5676 if (bond)
5677 allowed &= bond->sibling_mask;
5679 /* Restrict the bonded request to run on only the available engines */
5680 exec = READ_ONCE(rq->execution_mask);
5681 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5684 /* Prevent the master from being re-run on the bonded engines */
5685 to_request(signal)->execution_mask &= ~allowed;
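/*
 * intel_execlists_create_virtual() builds a load-balancing virtual engine
 * on top of @count physical @siblings. A single sibling degenerates to a
 * plain context on that engine; otherwise a struct virtual_engine is
 * allocated around a dummy intel_engine_cs, the emission vfuncs are
 * copied from the siblings (which must all share an engine class), and
 * submission is routed through virtual_submit_request() and
 * virtual_submission_tasklet() so the request runs on whichever sibling
 * picks it up first.
 */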
5688 struct intel_context *
5689 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5690 unsigned int count)
5692 struct virtual_engine *ve;
5693 unsigned int n;
5694 int err;
5696 if (count == 0)
5697 return ERR_PTR(-EINVAL);
5699 if (count == 1)
5700 return intel_context_create(siblings[0]);
5702 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5703 if (!ve)
5704 return ERR_PTR(-ENOMEM);
5706 ve->base.i915 = siblings[0]->i915;
5707 ve->base.gt = siblings[0]->gt;
5708 ve->base.uncore = siblings[0]->uncore;
5709 ve->base.id = -1;
5711 ve->base.class = OTHER_CLASS;
5712 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5713 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5714 ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5717 * The decision on whether to submit a request using semaphores
5718 * depends on the saturated state of the engine. We only compute
5719 * this during HW submission of the request, and we need this
5720 * state to be globally applied to all requests being submitted
5721 * to this engine. Virtual engines encompass more than one physical
5722 * engine and so we cannot accurately tell in advance if one of those
5723 * engines is already saturated and so cannot afford to use a semaphore
5724 * and be pessimized in priority for doing so -- if we are the only
5725 * context using semaphores after all other clients have stopped, we
5726 * will be starved on the saturated system. Such a global switch for
5727 * semaphores is less than ideal, but alas is the current compromise.
5729 ve->base.saturated = ALL_ENGINES;
5731 snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5733 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5734 intel_engine_init_breadcrumbs(&ve->base);
5735 intel_engine_init_execlists(&ve->base);
5736 ve->base.breadcrumbs.irq_armed = true; /* fake HW, used for irq_work */
5738 ve->base.cops = &virtual_context_ops;
5739 ve->base.request_alloc = execlists_request_alloc;
5741 ve->base.schedule = i915_schedule;
5742 ve->base.submit_request = virtual_submit_request;
5743 ve->base.bond_execute = virtual_bond_execute;
5745 INIT_LIST_HEAD(virtual_queue(ve));
5746 ve->base.execlists.queue_priority_hint = INT_MIN;
5747 tasklet_init(&ve->base.execlists.tasklet,
5748 virtual_submission_tasklet,
5749 (unsigned long)ve);
5751 intel_context_init(&ve->context, &ve->base);
5753 for (n = 0; n < count; n++) {
5754 struct intel_engine_cs *sibling = siblings[n];
5756 GEM_BUG_ON(!is_power_of_2(sibling->mask));
5757 if (sibling->mask & ve->base.mask) {
5758 DRM_DEBUG("duplicate %s entry in load balancer\n",
5759 sibling->name);
5760 err = -EINVAL;
5761 goto err_put;
5765 * The virtual engine implementation is tightly coupled to
5766 * the execlists backend -- we push requests directly
5767 * into a tree inside each physical engine. We could support
5768 * layering if we handled cloning of the requests and
5769 * submitted a copy into each backend.
5771 if (sibling->execlists.tasklet.func !=
5772 execlists_submission_tasklet) {
5773 err = -ENODEV;
5774 goto err_put;
5777 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5778 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5780 ve->siblings[ve->num_siblings++] = sibling;
5781 ve->base.mask |= sibling->mask;
5784 * All physical engines must be compatible for their emission
5785 * functions (as we build the instructions during request
5786 * construction and do not alter them before submission
5787 * on the physical engine). We use the engine class as a guide
5788 * here, although that could be refined.
5790 if (ve->base.class != OTHER_CLASS) {
5791 if (ve->base.class != sibling->class) {
5792 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5793 sibling->class, ve->base.class);
5794 err = -EINVAL;
5795 goto err_put;
5797 continue;
5800 ve->base.class = sibling->class;
5801 ve->base.uabi_class = sibling->uabi_class;
5802 snprintf(ve->base.name, sizeof(ve->base.name),
5803 "v%dx%d", ve->base.class, count);
5804 ve->base.context_size = sibling->context_size;
5806 ve->base.emit_bb_start = sibling->emit_bb_start;
5807 ve->base.emit_flush = sibling->emit_flush;
5808 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5809 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5810 ve->base.emit_fini_breadcrumb_dw =
5811 sibling->emit_fini_breadcrumb_dw;
5813 ve->base.flags = sibling->flags;
5816 ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5818 virtual_engine_initial_hint(ve);
5819 return &ve->context;
5821 err_put:
5822 intel_context_put(&ve->context);
5823 return ERR_PTR(err);
struct intel_context *
intel_execlists_clone_virtual(struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

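/*
 * Add @sibling to the set of engines that may be selected to run a bonded
 * request alongside @master; the allowed engines are accumulated as a mask
 * in this virtual engine's bond entry for @master.
 */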
int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}

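/*
 * Return the physical engine at index @sibling within this virtual engine,
 * or NULL if the index is out of range.
 */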
struct intel_engine_cs *
intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
				 unsigned int sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);

	if (sibling >= ve->num_siblings)
		return NULL;

	return ve->siblings[sibling];
}

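/*
 * Dump up to @max requests from each of the engine's list of executing
 * requests, its priority queue and any pending virtual-engine requests into
 * @m, formatting each entry with the caller-provided @show_request callback.
 */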
void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	if (execlists->switch_priority_hint != INT_MIN)
		drm_printf(m, "\t\tSwitch priority hint: %d\n",
			   READ_ONCE(execlists->switch_priority_hint));
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   READ_ONCE(execlists->queue_priority_hint));

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

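/*
 * Reset the pinned context @ce after a GPU hang: optionally scrub its image
 * back to the default state and repoint the ring registers at @head so the
 * engine can resume from a known-good position.
 */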
void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
	 */
	if (scrub)
		restore_default_state(ce, engine);

	/* Rerun the request; its payload has been neutered (if guilty). */
	__execlists_update_reg_state(ce, engine, head);
}

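/*
 * Report whether the engine is currently driven by the execlists submission
 * backend, i.e. its default submission hook is the execlists one.
 */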
bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
	return engine->set_default_submission ==
	       intel_execlists_set_default_submission;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif