drivers/gpu/drm/i915/gt/intel_lrc.c

   1 /*
   2  * Copyright © 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Ben Widawsky <ben@bwidawsk.net>
  25  *    Michel Thierry <michel.thierry@intel.com>
  26  *    Thomas Daniel <thomas.daniel@intel.com>
  27  *    Oscar Mateo <oscar.mateo@intel.com>
  28  *
  29  */
  30
  31 /**
  32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
  33  *
  34  * Motivation:
  35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
  36  * These expanded contexts enable a number of new abilities, especially
  37  * "Execlists" (also implemented in this file).
  38  *
  39  * One of the main differences with the legacy HW contexts is that logical
  40  * ring contexts incorporate many more things to the context's state, like
  41  * PDPs or ringbuffer control registers:
  42  *
  43  * The reason why PDPs are included in the context is straightforward: as
  44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
  45  * contained there mean you don't need to do a ppgtt->switch_mm yourself,
  46  * instead, the GPU will do it for you on the context switch.
  47  *
  48  * But, what about the ringbuffer control registers (head, tail, etc..)?
  49  * shouldn't we just need a set of those per engine command streamer? This is
  50  * where the name "Logical Rings" starts to make sense: by virtualizing the
  51  * rings, the engine cs shifts to a new "ring buffer" with every context
  52  * switch. When you want to submit a workload to the GPU you: A) choose your
  53  * context, B) find its appropriate virtualized ring, C) write commands to it
  54  * and then, finally, D) tell the GPU to switch to that context.
  55  *
  56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
  57  * to a contexts is via a context execution list, ergo "Execlists".
  58  *
  59  * LRC implementation:
  60  * Regarding the creation of contexts, we have:
  61  *
  62  * - One global default context.
  63  * - One local default context for each opened fd.
  64  * - One local extra context for each context create ioctl call.
  65  *
  66  * Now that ringbuffers belong per-context (and not per-engine, like before)
  67  * and that contexts are uniquely tied to a given engine (and not reusable,
  68  * like before) we need:
  69  *
  70  * - One ringbuffer per-engine inside each context.
  71  * - One backing object per-engine inside each context.
  72  *
  73  * The global default context starts its life with these new objects fully
  74  * allocated and populated. The local default context for each opened fd is
  75  * more complex, because we don't know at creation time which engine is going
  76  * to use them. To handle this, we have implemented a deferred creation of LR
  77  * contexts:
  78  *
  79  * The local context starts its life as a hollow or blank holder, that only
  80  * gets populated for a given engine once we receive an execbuffer. If later
  81  * on we receive another execbuffer ioctl for the same context but a different
  82  * engine, we allocate/populate a new ringbuffer and context backing object and
  83  * so on.
  84  *
  85  * Finally, regarding local contexts created using the ioctl call: as they are
  86  * only allowed with the render ring, we can allocate & populate them right
  87  * away (no need to defer anything, at least for now).
  88  *
  89  * Execlists implementation:
  90  * Execlists are the new method by which, on gen8+ hardware, workloads are
  91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
  92  * This method works as follows:
  93  *
  94  * When a request is committed, its commands (the BB start and any leading or
  95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
  96  * for the appropriate context. The tail pointer in the hardware context is not
  97  * updated at this time, but instead, kept by the driver in the ringbuffer
  98  * structure. A structure representing this request is added to a request queue
  99  * for the appropriate engine: this structure contains a copy of the context's
 100  * tail after the request was written to the ring buffer and a pointer to the
 101  * context itself.
 102  *
 103  * If the engine's request queue was empty before the request was added, the
 104  * queue is processed immediately. Otherwise the queue will be processed during
 105  * a context switch interrupt. In any case, elements on the queue will get sent
 106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 107  * globally unique 20-bits submission ID.
 108  *
 109  * When execution of a request completes, the GPU updates the context status
 110  * buffer with a context complete event and generates a context switch interrupt.
 111  * During the interrupt handling, the driver examines the events in the buffer:
 112  * for each context complete event, if the announced ID matches that on the head
 113  * of the request queue, then that request is retired and removed from the queue.
 114  *
 115  * After processing, if any requests were retired and the queue is not empty
 116  * then a new execution list can be submitted. The two requests at the front of
 117  * the queue are next to be submitted but since a context may not occur twice in
 118  * an execution list, if subsequent requests have the same ID as the first then
 119  * the two requests must be combined. This is done simply by discarding requests
 120  * at the head of the queue until either only one requests is left (in which case
 121  * we use a NULL second context) or the first two requests have unique IDs.
 122  *
 123  * By always executing the first two requests in the queue the driver ensures
 124  * that the GPU is kept as busy as possible. In the case where a single context
 125  * completes but a second context is still executing, the request for this second
 126  * context will be at the head of the queue when we remove the first one. This
 127  * request will then be resubmitted along with a new request for a different context,
 128  * which will cause the hardware to continue executing the second request and queue
 129  * the new request (the GPU detects the condition of a context getting preempted
 130  * with the same context and optimizes the context switch flow by not doing
 131  * preemption, but just sampling the new tail pointer).
 132  *
 133  */
 134 #include <linux/interrupt.h>
 135
 136 #include "i915_drv.h"
 137 #include "i915_perf.h"
 138 #include "i915_trace.h"
 139 #include "i915_vgpu.h"
 140 #include "intel_context.h"
 141 #include "intel_engine_pm.h"
 142 #include "intel_gt.h"
 143 #include "intel_gt_pm.h"
 144 #include "intel_gt_requests.h"
 145 #include "intel_lrc_reg.h"
 146 #include "intel_mocs.h"
 147 #include "intel_reset.h"
 148 #include "intel_ring.h"
 149 #include "intel_workarounds.h"
 150
 151 #define RING_EXECLIST_QFULL             (1 << 0x2)
 152 #define RING_EXECLIST1_VALID            (1 << 0x3)
 153 #define RING_EXECLIST0_VALID            (1 << 0x4)
 154 #define RING_EXECLIST_ACTIVE_STATUS     (3 << 0xE)
 155 #define RING_EXECLIST1_ACTIVE           (1 << 0x11)
 156 #define RING_EXECLIST0_ACTIVE           (1 << 0x12)
 157
 158 #define GEN8_CTX_STATUS_IDLE_ACTIVE     (1 << 0)
 159 #define GEN8_CTX_STATUS_PREEMPTED       (1 << 1)
 160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH  (1 << 2)
 161 #define GEN8_CTX_STATUS_ACTIVE_IDLE     (1 << 3)
 162 #define GEN8_CTX_STATUS_COMPLETE        (1 << 4)
 163 #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
 164
 165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
 166          (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
 167
 168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
 169
 170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE  (0x1) /* lower csb dword */
 171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
 172 #define GEN12_CSB_SW_CTX_ID_MASK                GENMASK(25, 15)
 173 #define GEN12_IDLE_CTX_ID               0x7FF
 174 #define GEN12_CSB_CTX_VALID(csb_dw) \
 175         (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
 176
 177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
 178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
 179 #define WA_TAIL_DWORDS 2
 180 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 181
 182 struct virtual_engine {
 183         struct intel_engine_cs base;
 184         struct intel_context context;
 185
 186         /*
 187          * We allow only a single request through the virtual engine at a time
 188          * (each request in the timeline waits for the completion fence of
 189          * the previous before being submitted). By restricting ourselves to
 190          * only submitting a single request, each request is placed on to a
 191          * physical to maximise load spreading (by virtue of the late greedy
 192          * scheduling -- each real engine takes the next available request
 193          * upon idling).
 194          */
 195         struct i915_request *request;
 196
 197         /*
 198          * We keep a rbtree of available virtual engines inside each physical
 199          * engine, sorted by priority. Here we preallocate the nodes we need
 200          * for the virtual engine, indexed by physical_engine->id.
 201          */
 202         struct ve_node {
 203                 struct rb_node rb;
 204                 int prio;
 205         } nodes[I915_NUM_ENGINES];
 206
 207         /*
 208          * Keep track of bonded pairs -- restrictions upon on our selection
 209          * of physical engines any particular request may be submitted to.
 210          * If we receive a submit-fence from a master engine, we will only
 211          * use one of sibling_mask physical engines.
 212          */
 213         struct ve_bond {
 214                 const struct intel_engine_cs *master;
 215                 intel_engine_mask_t sibling_mask;
 216         } *bonds;
 217         unsigned int num_bonds;
 218
 219         /* And finally, which physical engines this virtual engine maps onto. */
 220         unsigned int num_siblings;
 221         struct intel_engine_cs *siblings[0];
 222 };
 223
 224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
 225 {
 226         GEM_BUG_ON(!intel_engine_is_virtual(engine));
 227         return container_of(engine, struct virtual_engine, base);
 228 }
 229
 230 static int __execlists_context_alloc(struct intel_context *ce,
 231                                      struct intel_engine_cs *engine);
 232
 233 static void execlists_init_reg_state(u32 *reg_state,
 234                                      const struct intel_context *ce,
 235                                      const struct intel_engine_cs *engine,
 236                                      const struct intel_ring *ring,
 237                                      bool close);
 238 static void
 239 __execlists_update_reg_state(const struct intel_context *ce,
 240                              const struct intel_engine_cs *engine);
 241
 242 static void mark_eio(struct i915_request *rq)
 243 {
 244         if (i915_request_completed(rq))
 245                 return;
 246
 247         GEM_BUG_ON(i915_request_signaled(rq));
 248
 249         dma_fence_set_error(&rq->fence, -EIO);
 250         i915_request_mark_complete(rq);
 251 }
 252
 253 static struct i915_request *
 254 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
 255 {
 256         struct i915_request *active = rq;
 257
 258         rcu_read_lock();
 259         list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
 260                 if (i915_request_completed(rq))
 261                         break;
 262
 263                 active = rq;
 264         }
 265         rcu_read_unlock();
 266
 267         return active;
 268 }
 269
 270 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
 271 {
 272         return (i915_ggtt_offset(engine->status_page.vma) +
 273                 I915_GEM_HWS_PREEMPT_ADDR);
 274 }
 275
 276 static inline void
 277 ring_set_paused(const struct intel_engine_cs *engine, int state)
 278 {
 279         /*
 280          * We inspect HWS_PREEMPT with a semaphore inside
 281          * engine->emit_fini_breadcrumb. If the dword is true,
 282          * the ring is paused as the semaphore will busywait
 283          * until the dword is false.
 284          */
 285         engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
 286         if (state)
 287                 wmb();
 288 }
 289
 290 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 291 {
 292         return rb_entry(rb, struct i915_priolist, node);
 293 }
 294
 295 static inline int rq_prio(const struct i915_request *rq)
 296 {
 297         return rq->sched.attr.priority;
 298 }
 299
 300 static int effective_prio(const struct i915_request *rq)
 301 {
 302         int prio = rq_prio(rq);
 303
 304         /*
 305          * If this request is special and must not be interrupted at any
 306          * cost, so be it. Note we are only checking the most recent request
 307          * in the context and so may be masking an earlier vip request. It
 308          * is hoped that under the conditions where nopreempt is used, this
 309          * will not matter (i.e. all requests to that context will be
 310          * nopreempt for as long as desired).
 311          */
 312         if (i915_request_has_nopreempt(rq))
 313                 prio = I915_PRIORITY_UNPREEMPTABLE;
 314
 315         /*
 316          * On unwinding the active request, we give it a priority bump
 317          * if it has completed waiting on any semaphore. If we know that
 318          * the request has already started, we can prevent an unwanted
 319          * preempt-to-idle cycle by taking that into account now.
 320          */
 321         if (__i915_request_has_started(rq))
 322                 prio |= I915_PRIORITY_NOSEMAPHORE;
 323
 324         /* Restrict mere WAIT boosts from triggering preemption */
 325         BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
 326         return prio | __NO_PREEMPTION;
 327 }
 328
 329 static int queue_prio(const struct intel_engine_execlists *execlists)
 330 {
 331         struct i915_priolist *p;
 332         struct rb_node *rb;
 333
 334         rb = rb_first_cached(&execlists->queue);
 335         if (!rb)
 336                 return INT_MIN;
 337
 338         /*
 339          * As the priolist[] are inverted, with the highest priority in [0],
 340          * we have to flip the index value to become priority.
 341          */
 342         p = to_priolist(rb);
 343         return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
 344 }
 345
 346 static inline bool need_preempt(const struct intel_engine_cs *engine,
 347                                 const struct i915_request *rq,
 348                                 struct rb_node *rb)
 349 {
 350         int last_prio;
 351
 352         if (!intel_engine_has_semaphores(engine))
 353                 return false;
 354
 355         /*
 356          * Check if the current priority hint merits a preemption attempt.
 357          *
 358          * We record the highest value priority we saw during rescheduling
 359          * prior to this dequeue, therefore we know that if it is strictly
 360          * less than the current tail of ESLP[0], we do not need to force
 361          * a preempt-to-idle cycle.
 362          *
 363          * However, the priority hint is a mere hint that we may need to
 364          * preempt. If that hint is stale or we may be trying to preempt
 365          * ourselves, ignore the request.
 366          *
 367          * More naturally we would write
 368          *      prio >= max(0, last);
 369          * except that we wish to prevent triggering preemption at the same
 370          * priority level: the task that is running should remain running
 371          * to preserve FIFO ordering of dependencies.
 372          */
 373         last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
 374         if (engine->execlists.queue_priority_hint <= last_prio)
 375                 return false;
 376
 377         /*
 378          * Check against the first request in ELSP[1], it will, thanks to the
 379          * power of PI, be the highest priority of that context.
 380          */
 381         if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
 382             rq_prio(list_next_entry(rq, sched.link)) > last_prio)
 383                 return true;
 384
 385         if (rb) {
 386                 struct virtual_engine *ve =
 387                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
 388                 bool preempt = false;
 389
 390                 if (engine == ve->siblings[0]) { /* only preempt one sibling */
 391                         struct i915_request *next;
 392
 393                         rcu_read_lock();
 394                         next = READ_ONCE(ve->request);
 395                         if (next)
 396                                 preempt = rq_prio(next) > last_prio;
 397                         rcu_read_unlock();
 398                 }
 399
 400                 if (preempt)
 401                         return preempt;
 402         }
 403
 404         /*
 405          * If the inflight context did not trigger the preemption, then maybe
 406          * it was the set of queued requests? Pick the highest priority in
 407          * the queue (the first active priolist) and see if it deserves to be
 408          * running instead of ELSP[0].
 409          *
 410          * The highest priority request in the queue can not be either
 411          * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
 412          * context, it's priority would not exceed ELSP[0] aka last_prio.
 413          */
 414         return queue_prio(&engine->execlists) > last_prio;
 415 }
 416
 417 __maybe_unused static inline bool
 418 assert_priority_queue(const struct i915_request *prev,
 419                       const struct i915_request *next)
 420 {
 421         /*
 422          * Without preemption, the prev may refer to the still active element
 423          * which we refuse to let go.
 424          *
 425          * Even with preemption, there are times when we think it is better not
 426          * to preempt and leave an ostensibly lower priority request in flight.
 427          */
 428         if (i915_request_is_active(prev))
 429                 return true;
 430
 431         return rq_prio(prev) >= rq_prio(next);
 432 }
 433
 434 /*
 435  * The context descriptor encodes various attributes of a context,
 436  * including its GTT address and some flags. Because it's fairly
 437  * expensive to calculate, we'll just do it once and cache the result,
 438  * which remains valid until the context is unpinned.
 439  *
 440  * This is what a descriptor looks like, from LSB to MSB::
 441  *
 442  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 443  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 444  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 445  *      bits 53-54:    mbz, reserved for use by hardware
 446  *      bits 55-63:    group ID, currently unused and set to 0
 447  *
 448  * Starting from Gen11, the upper dword of the descriptor has a new format:
 449  *
 450  *      bits 32-36:    reserved
 451  *      bits 37-47:    SW context ID
 452  *      bits 48:53:    engine instance
 453  *      bit 54:        mbz, reserved for use by hardware
 454  *      bits 55-60:    SW counter
 455  *      bits 61-63:    engine class
 456  *
 457  * engine info, SW context ID and SW counter need to form a unique number
 458  * (Context ID) per lrc.
 459  */
 460 static u64
 461 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
 462 {
 463         u64 desc;
 464
 465         desc = INTEL_LEGACY_32B_CONTEXT;
 466         if (i915_vm_is_4lvl(ce->vm))
 467                 desc = INTEL_LEGACY_64B_CONTEXT;
 468         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
 469
 470         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
 471         if (IS_GEN(engine->i915, 8))
 472                 desc |= GEN8_CTX_L3LLC_COHERENT;
 473
 474         desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
 475         /*
 476          * The following 32bits are copied into the OA reports (dword 2).
 477          * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
 478          * anything below.
 479          */
 480         if (INTEL_GEN(engine->i915) >= 11) {
 481                 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
 482                                                                 /* bits 48-53 */
 483
 484                 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
 485                                                                 /* bits 61-63 */
 486         }
 487
 488         return desc;
 489 }
 490
 491 static inline unsigned int dword_in_page(void *addr)
 492 {
 493         return offset_in_page(addr) / sizeof(u32);
 494 }
 495
 496 static void set_offsets(u32 *regs,
 497                         const u8 *data,
 498                         const struct intel_engine_cs *engine,
 499                         bool clear)
 500 #define NOP(x) (BIT(7) | (x))
 501 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
 502 #define POSTED BIT(0)
 503 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
 504 #define REG16(x) \
 505         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
 506         (((x) >> 2) & 0x7f)
 507 #define END(x) 0, (x)
 508 {
 509         const u32 base = engine->mmio_base;
 510
 511         while (*data) {
 512                 u8 count, flags;
 513
 514                 if (*data & BIT(7)) { /* skip */
 515                         count = *data++ & ~BIT(7);
 516                         if (clear)
 517                                 memset32(regs, MI_NOOP, count);
 518                         regs += count;
 519                         continue;
 520                 }
 521
 522                 count = *data & 0x3f;
 523                 flags = *data >> 6;
 524                 data++;
 525
 526                 *regs = MI_LOAD_REGISTER_IMM(count);
 527                 if (flags & POSTED)
 528                         *regs |= MI_LRI_FORCE_POSTED;
 529                 if (INTEL_GEN(engine->i915) >= 11)
 530                         *regs |= MI_LRI_CS_MMIO;
 531                 regs++;
 532
 533                 GEM_BUG_ON(!count);
 534                 do {
 535                         u32 offset = 0;
 536                         u8 v;
 537
 538                         do {
 539                                 v = *data++;
 540                                 offset <<= 7;
 541                                 offset |= v & ~BIT(7);
 542                         } while (v & BIT(7));
 543
 544                         regs[0] = base + (offset << 2);
 545                         if (clear)
 546                                 regs[1] = 0;
 547                         regs += 2;
 548                 } while (--count);
 549         }
 550
 551         if (clear) {
 552                 u8 count = *++data;
 553
 554                 /* Clear past the tail for HW access */
 555                 GEM_BUG_ON(dword_in_page(regs) > count);
 556                 memset32(regs, MI_NOOP, count - dword_in_page(regs));
 557
 558                 /* Close the batch; used mainly by live_lrc_layout() */
 559                 *regs = MI_BATCH_BUFFER_END;
 560                 if (INTEL_GEN(engine->i915) >= 10)
 561                         *regs |= BIT(0);
 562         }
 563 }
 564
 565 static const u8 gen8_xcs_offsets[] = {
 566         NOP(1),
 567         LRI(11, 0),
 568         REG16(0x244),
 569         REG(0x034),
 570         REG(0x030),
 571         REG(0x038),
 572         REG(0x03c),
 573         REG(0x168),
 574         REG(0x140),
 575         REG(0x110),
 576         REG(0x11c),
 577         REG(0x114),
 578         REG(0x118),
 579
 580         NOP(9),
 581         LRI(9, 0),
 582         REG16(0x3a8),
 583         REG16(0x28c),
 584         REG16(0x288),
 585         REG16(0x284),
 586         REG16(0x280),
 587         REG16(0x27c),
 588         REG16(0x278),
 589         REG16(0x274),
 590         REG16(0x270),
 591
 592         NOP(13),
 593         LRI(2, 0),
 594         REG16(0x200),
 595         REG(0x028),
 596
 597         END(80)
 598 };
 599
 600 static const u8 gen9_xcs_offsets[] = {
 601         NOP(1),
 602         LRI(14, POSTED),
 603         REG16(0x244),
 604         REG(0x034),
 605         REG(0x030),
 606         REG(0x038),
 607         REG(0x03c),
 608         REG(0x168),
 609         REG(0x140),
 610         REG(0x110),
 611         REG(0x11c),
 612         REG(0x114),
 613         REG(0x118),
 614         REG(0x1c0),
 615         REG(0x1c4),
 616         REG(0x1c8),
 617
 618         NOP(3),
 619         LRI(9, POSTED),
 620         REG16(0x3a8),
 621         REG16(0x28c),
 622         REG16(0x288),
 623         REG16(0x284),
 624         REG16(0x280),
 625         REG16(0x27c),
 626         REG16(0x278),
 627         REG16(0x274),
 628         REG16(0x270),
 629
 630         NOP(13),
 631         LRI(1, POSTED),
 632         REG16(0x200),
 633
 634         NOP(13),
 635         LRI(44, POSTED),
 636         REG(0x028),
 637         REG(0x09c),
 638         REG(0x0c0),
 639         REG(0x178),
 640         REG(0x17c),
 641         REG16(0x358),
 642         REG(0x170),
 643         REG(0x150),
 644         REG(0x154),
 645         REG(0x158),
 646         REG16(0x41c),
 647         REG16(0x600),
 648         REG16(0x604),
 649         REG16(0x608),
 650         REG16(0x60c),
 651         REG16(0x610),
 652         REG16(0x614),
 653         REG16(0x618),
 654         REG16(0x61c),
 655         REG16(0x620),
 656         REG16(0x624),
 657         REG16(0x628),
 658         REG16(0x62c),
 659         REG16(0x630),
 660         REG16(0x634),
 661         REG16(0x638),
 662         REG16(0x63c),
 663         REG16(0x640),
 664         REG16(0x644),
 665         REG16(0x648),
 666         REG16(0x64c),
 667         REG16(0x650),
 668         REG16(0x654),
 669         REG16(0x658),
 670         REG16(0x65c),
 671         REG16(0x660),
 672         REG16(0x664),
 673         REG16(0x668),
 674         REG16(0x66c),
 675         REG16(0x670),
 676         REG16(0x674),
 677         REG16(0x678),
 678         REG16(0x67c),
 679         REG(0x068),
 680
 681         END(176)
 682 };
 683
 684 static const u8 gen12_xcs_offsets[] = {
 685         NOP(1),
 686         LRI(13, POSTED),
 687         REG16(0x244),
 688         REG(0x034),
 689         REG(0x030),
 690         REG(0x038),
 691         REG(0x03c),
 692         REG(0x168),
 693         REG(0x140),
 694         REG(0x110),
 695         REG(0x1c0),
 696         REG(0x1c4),
 697         REG(0x1c8),
 698         REG(0x180),
 699         REG16(0x2b4),
 700
 701         NOP(5),
 702         LRI(9, POSTED),
 703         REG16(0x3a8),
 704         REG16(0x28c),
 705         REG16(0x288),
 706         REG16(0x284),
 707         REG16(0x280),
 708         REG16(0x27c),
 709         REG16(0x278),
 710         REG16(0x274),
 711         REG16(0x270),
 712
 713         END(80)
 714 };
 715
 716 static const u8 gen8_rcs_offsets[] = {
 717         NOP(1),
 718         LRI(14, POSTED),
 719         REG16(0x244),
 720         REG(0x034),
 721         REG(0x030),
 722         REG(0x038),
 723         REG(0x03c),
 724         REG(0x168),
 725         REG(0x140),
 726         REG(0x110),
 727         REG(0x11c),
 728         REG(0x114),
 729         REG(0x118),
 730         REG(0x1c0),
 731         REG(0x1c4),
 732         REG(0x1c8),
 733
 734         NOP(3),
 735         LRI(9, POSTED),
 736         REG16(0x3a8),
 737         REG16(0x28c),
 738         REG16(0x288),
 739         REG16(0x284),
 740         REG16(0x280),
 741         REG16(0x27c),
 742         REG16(0x278),
 743         REG16(0x274),
 744         REG16(0x270),
 745
 746         NOP(13),
 747         LRI(1, 0),
 748         REG(0x0c8),
 749
 750         END(80)
 751 };
 752
 753 static const u8 gen9_rcs_offsets[] = {
 754         NOP(1),
 755         LRI(14, POSTED),
 756         REG16(0x244),
 757         REG(0x34),
 758         REG(0x30),
 759         REG(0x38),
 760         REG(0x3c),
 761         REG(0x168),
 762         REG(0x140),
 763         REG(0x110),
 764         REG(0x11c),
 765         REG(0x114),
 766         REG(0x118),
 767         REG(0x1c0),
 768         REG(0x1c4),
 769         REG(0x1c8),
 770
 771         NOP(3),
 772         LRI(9, POSTED),
 773         REG16(0x3a8),
 774         REG16(0x28c),
 775         REG16(0x288),
 776         REG16(0x284),
 777         REG16(0x280),
 778         REG16(0x27c),
 779         REG16(0x278),
 780         REG16(0x274),
 781         REG16(0x270),
 782
 783         NOP(13),
 784         LRI(1, 0),
 785         REG(0xc8),
 786
 787         NOP(13),
 788         LRI(44, POSTED),
 789         REG(0x28),
 790         REG(0x9c),
 791         REG(0xc0),
 792         REG(0x178),
 793         REG(0x17c),
 794         REG16(0x358),
 795         REG(0x170),
 796         REG(0x150),
 797         REG(0x154),
 798         REG(0x158),
 799         REG16(0x41c),
 800         REG16(0x600),
 801         REG16(0x604),
 802         REG16(0x608),
 803         REG16(0x60c),
 804         REG16(0x610),
 805         REG16(0x614),
 806         REG16(0x618),
 807         REG16(0x61c),
 808         REG16(0x620),
 809         REG16(0x624),
 810         REG16(0x628),
 811         REG16(0x62c),
 812         REG16(0x630),
 813         REG16(0x634),
 814         REG16(0x638),
 815         REG16(0x63c),
 816         REG16(0x640),
 817         REG16(0x644),
 818         REG16(0x648),
 819         REG16(0x64c),
 820         REG16(0x650),
 821         REG16(0x654),
 822         REG16(0x658),
 823         REG16(0x65c),
 824         REG16(0x660),
 825         REG16(0x664),
 826         REG16(0x668),
 827         REG16(0x66c),
 828         REG16(0x670),
 829         REG16(0x674),
 830         REG16(0x678),
 831         REG16(0x67c),
 832         REG(0x68),
 833
 834         END(176)
 835 };
 836
 837 static const u8 gen11_rcs_offsets[] = {
 838         NOP(1),
 839         LRI(15, POSTED),
 840         REG16(0x244),
 841         REG(0x034),
 842         REG(0x030),
 843         REG(0x038),
 844         REG(0x03c),
 845         REG(0x168),
 846         REG(0x140),
 847         REG(0x110),
 848         REG(0x11c),
 849         REG(0x114),
 850         REG(0x118),
 851         REG(0x1c0),
 852         REG(0x1c4),
 853         REG(0x1c8),
 854         REG(0x180),
 855
 856         NOP(1),
 857         LRI(9, POSTED),
 858         REG16(0x3a8),
 859         REG16(0x28c),
 860         REG16(0x288),
 861         REG16(0x284),
 862         REG16(0x280),
 863         REG16(0x27c),
 864         REG16(0x278),
 865         REG16(0x274),
 866         REG16(0x270),
 867
 868         LRI(1, POSTED),
 869         REG(0x1b0),
 870
 871         NOP(10),
 872         LRI(1, 0),
 873         REG(0x0c8),
 874
 875         END(80)
 876 };
 877
 878 static const u8 gen12_rcs_offsets[] = {
 879         NOP(1),
 880         LRI(13, POSTED),
 881         REG16(0x244),
 882         REG(0x034),
 883         REG(0x030),
 884         REG(0x038),
 885         REG(0x03c),
 886         REG(0x168),
 887         REG(0x140),
 888         REG(0x110),
 889         REG(0x1c0),
 890         REG(0x1c4),
 891         REG(0x1c8),
 892         REG(0x180),
 893         REG16(0x2b4),
 894
 895         NOP(5),
 896         LRI(9, POSTED),
 897         REG16(0x3a8),
 898         REG16(0x28c),
 899         REG16(0x288),
 900         REG16(0x284),
 901         REG16(0x280),
 902         REG16(0x27c),
 903         REG16(0x278),
 904         REG16(0x274),
 905         REG16(0x270),
 906
 907         LRI(3, POSTED),
 908         REG(0x1b0),
 909         REG16(0x5a8),
 910         REG16(0x5ac),
 911
 912         NOP(6),
 913         LRI(1, 0),
 914         REG(0x0c8),
 915
 916         END(80)
 917 };
 918
 919 #undef END
 920 #undef REG16
 921 #undef REG
 922 #undef LRI
 923 #undef NOP
 924
 925 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
 926 {
 927         /*
 928          * The gen12+ lists only have the registers we program in the basic
 929          * default state. We rely on the context image using relative
 930          * addressing to automatic fixup the register state between the
 931          * physical engines for virtual engine.
 932          */
 933         GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
 934                    !intel_engine_has_relative_mmio(engine));
 935
 936         if (engine->class == RENDER_CLASS) {
 937                 if (INTEL_GEN(engine->i915) >= 12)
 938                         return gen12_rcs_offsets;
 939                 else if (INTEL_GEN(engine->i915) >= 11)
 940                         return gen11_rcs_offsets;
 941                 else if (INTEL_GEN(engine->i915) >= 9)
 942                         return gen9_rcs_offsets;
 943                 else
 944                         return gen8_rcs_offsets;
 945         } else {
 946                 if (INTEL_GEN(engine->i915) >= 12)
 947                         return gen12_xcs_offsets;
 948                 else if (INTEL_GEN(engine->i915) >= 9)
 949                         return gen9_xcs_offsets;
 950                 else
 951                         return gen8_xcs_offsets;
 952         }
 953 }
 954
 955 static struct i915_request *
 956 __unwind_incomplete_requests(struct intel_engine_cs *engine)
 957 {
 958         struct i915_request *rq, *rn, *active = NULL;
 959         struct list_head *uninitialized_var(pl);
 960         int prio = I915_PRIORITY_INVALID;
 961
 962         lockdep_assert_held(&engine->active.lock);
 963
 964         list_for_each_entry_safe_reverse(rq, rn,
 965                                          &engine->active.requests,
 966                                          sched.link) {
 967                 if (i915_request_completed(rq))
 968                         continue; /* XXX */
 969
 970                 __i915_request_unsubmit(rq);
 971
 972                 /*
 973                  * Push the request back into the queue for later resubmission.
 974                  * If this request is not native to this physical engine (i.e.
 975                  * it came from a virtual source), push it back onto the virtual
 976                  * engine so that it can be moved across onto another physical
 977                  * engine as load dictates.
 978                  */
 979                 if (likely(rq->execution_mask == engine->mask)) {
 980                         GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
 981                         if (rq_prio(rq) != prio) {
 982                                 prio = rq_prio(rq);
 983                                 pl = i915_sched_lookup_priolist(engine, prio);
 984                         }
 985                         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
 986
 987                         list_move(&rq->sched.link, pl);
 988                         active = rq;
 989                 } else {
 990                         struct intel_engine_cs *owner = rq->context->engine;
 991
 992                         /*
 993                          * Decouple the virtual breadcrumb before moving it
 994                          * back to the virtual engine -- we don't want the
 995                          * request to complete in the background and try
 996                          * and cancel the breadcrumb on the virtual engine
 997                          * (instead of the old engine where it is linked)!
 998                          */
 999                         if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1000                                      &rq->fence.flags)) {
1001                                 spin_lock_nested(&rq->lock,
1002                                                  SINGLE_DEPTH_NESTING);
1003                                 i915_request_cancel_breadcrumb(rq);
1004                                 spin_unlock(&rq->lock);
1005                         }
1006                         rq->engine = owner;
1007                         owner->submit_request(rq);
1008                         active = NULL;
1009                 }
1010         }
1011
1012         return active;
1013 }
1014
1015 struct i915_request *
1016 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1017 {
1018         struct intel_engine_cs *engine =
1019                 container_of(execlists, typeof(*engine), execlists);
1020
1021         return __unwind_incomplete_requests(engine);
1022 }
1023
1024 static inline void
1025 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1026 {
1027         /*
1028          * Only used when GVT-g is enabled now. When GVT-g is disabled,
1029          * The compiler should eliminate this function as dead-code.
1030          */
1031         if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1032                 return;
1033
1034         atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1035                                    status, rq);
1036 }
1037
1038 static void intel_engine_context_in(struct intel_engine_cs *engine)
1039 {
1040         unsigned long flags;
1041
1042         if (READ_ONCE(engine->stats.enabled) == 0)
1043                 return;
1044
1045         write_seqlock_irqsave(&engine->stats.lock, flags);
1046
1047         if (engine->stats.enabled > 0) {
1048                 if (engine->stats.active++ == 0)
1049                         engine->stats.start = ktime_get();
1050                 GEM_BUG_ON(engine->stats.active == 0);
1051         }
1052
1053         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1054 }
1055
1056 static void intel_engine_context_out(struct intel_engine_cs *engine)
1057 {
1058         unsigned long flags;
1059
1060         if (READ_ONCE(engine->stats.enabled) == 0)
1061                 return;
1062
1063         write_seqlock_irqsave(&engine->stats.lock, flags);
1064
1065         if (engine->stats.enabled > 0) {
1066                 ktime_t last;
1067
1068                 if (engine->stats.active && --engine->stats.active == 0) {
1069                         /*
1070                          * Decrement the active context count and in case GPU
1071                          * is now idle add up to the running total.
1072                          */
1073                         last = ktime_sub(ktime_get(), engine->stats.start);
1074
1075                         engine->stats.total = ktime_add(engine->stats.total,
1076                                                         last);
1077                 } else if (engine->stats.active == 0) {
1078                         /*
1079                          * After turning on engine stats, context out might be
1080                          * the first event in which case we account from the
1081                          * time stats gathering was turned on.
1082                          */
1083                         last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1084
1085                         engine->stats.total = ktime_add(engine->stats.total,
1086                                                         last);
1087                 }
1088         }
1089
1090         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1091 }
1092
1093 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1094 {
1095         if (INTEL_GEN(engine->i915) >= 12)
1096                 return 0x60;
1097         else if (INTEL_GEN(engine->i915) >= 9)
1098                 return 0x54;
1099         else if (engine->class == RENDER_CLASS)
1100                 return 0x58;
1101         else
1102                 return -1;
1103 }
1104
1105 static void
1106 execlists_check_context(const struct intel_context *ce,
1107                         const struct intel_engine_cs *engine)
1108 {
1109         const struct intel_ring *ring = ce->ring;
1110         u32 *regs = ce->lrc_reg_state;
1111         bool valid = true;
1112         int x;
1113
1114         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1115                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1116                        engine->name,
1117                        regs[CTX_RING_START],
1118                        i915_ggtt_offset(ring->vma));
1119                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1120                 valid = false;
1121         }
1122
1123         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1124             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1125                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1126                        engine->name,
1127                        regs[CTX_RING_CTL],
1128                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1129                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1130                 valid = false;
1131         }
1132
1133         x = lrc_ring_mi_mode(engine);
1134         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1135                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1136                        engine->name, regs[x + 1]);
1137                 regs[x + 1] &= ~STOP_RING;
1138                 regs[x + 1] |= STOP_RING << 16;
1139                 valid = false;
1140         }
1141
1142         WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1143 }
1144
1145 static void restore_default_state(struct intel_context *ce,
1146                                   struct intel_engine_cs *engine)
1147 {
1148         u32 *regs = ce->lrc_reg_state;
1149
1150         if (engine->pinned_default_state)
1151                 memcpy(regs, /* skip restoring the vanilla PPHWSP */
1152                        engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1153                        engine->context_size - PAGE_SIZE);
1154
1155         execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1156 }
1157
1158 static void reset_active(struct i915_request *rq,
1159                          struct intel_engine_cs *engine)
1160 {
1161         struct intel_context * const ce = rq->context;
1162         u32 head;
1163
1164         /*
1165          * The executing context has been cancelled. We want to prevent
1166          * further execution along this context and propagate the error on
1167          * to anything depending on its results.
1168          *
1169          * In __i915_request_submit(), we apply the -EIO and remove the
1170          * requests' payloads for any banned requests. But first, we must
1171          * rewind the context back to the start of the incomplete request so
1172          * that we do not jump back into the middle of the batch.
1173          *
1174          * We preserve the breadcrumbs and semaphores of the incomplete
1175          * requests so that inter-timeline dependencies (i.e other timelines)
1176          * remain correctly ordered. And we defer to __i915_request_submit()
1177          * so that all asynchronous waits are correctly handled.
1178          */
1179         ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1180                      rq->fence.context, rq->fence.seqno);
1181
1182         /* On resubmission of the active request, payload will be scrubbed */
1183         if (i915_request_completed(rq))
1184                 head = rq->tail;
1185         else
1186                 head = active_request(ce->timeline, rq)->head;
1187         ce->ring->head = intel_ring_wrap(ce->ring, head);
1188         intel_ring_update_space(ce->ring);
1189
1190         /* Scrub the context image to prevent replaying the previous batch */
1191         restore_default_state(ce, engine);
1192         __execlists_update_reg_state(ce, engine);
1193
1194         /* We've switched away, so this should be a no-op, but intent matters */
1195         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1196 }
1197
1198 static inline struct intel_engine_cs *
1199 __execlists_schedule_in(struct i915_request *rq)
1200 {
1201         struct intel_engine_cs * const engine = rq->engine;
1202         struct intel_context * const ce = rq->context;
1203
1204         intel_context_get(ce);
1205
1206         if (unlikely(intel_context_is_banned(ce)))
1207                 reset_active(rq, engine);
1208
1209         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1210                 execlists_check_context(ce, engine);
1211
1212         if (ce->tag) {
1213                 /* Use a fixed tag for OA and friends */
1214                 ce->lrc_desc |= (u64)ce->tag << 32;
1215         } else {
1216                 /* We don't need a strict matching tag, just different values */
1217                 ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1218                 ce->lrc_desc |=
1219                         (u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1220                         GEN11_SW_CTX_ID_SHIFT;
1221                 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1222         }
1223
1224         __intel_gt_pm_get(engine->gt);
1225         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1226         intel_engine_context_in(engine);
1227
1228         return engine;
1229 }
1230
1231 static inline struct i915_request *
1232 execlists_schedule_in(struct i915_request *rq, int idx)
1233 {
1234         struct intel_context * const ce = rq->context;
1235         struct intel_engine_cs *old;
1236
1237         GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1238         trace_i915_request_in(rq, idx);
1239
1240         old = READ_ONCE(ce->inflight);
1241         do {
1242                 if (!old) {
1243                         WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1244                         break;
1245                 }
1246         } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1247
1248         GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1249         return i915_request_get(rq);
1250 }
1251
1252 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1253 {
1254         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1255         struct i915_request *next = READ_ONCE(ve->request);
1256
1257         if (next && next->execution_mask & ~rq->execution_mask)
1258                 tasklet_schedule(&ve->base.execlists.tasklet);
1259 }
1260
1261 static inline void
1262 __execlists_schedule_out(struct i915_request *rq,
1263                          struct intel_engine_cs * const engine)
1264 {
1265         struct intel_context * const ce = rq->context;
1266
1267         /*
1268          * NB process_csb() is not under the engine->active.lock and hence
1269          * schedule_out can race with schedule_in meaning that we should
1270          * refrain from doing non-trivial work here.
1271          */
1272
1273         /*
1274          * If we have just completed this context, the engine may now be
1275          * idle and we want to re-enter powersaving.
1276          */
1277         if (list_is_last(&rq->link, &ce->timeline->requests) &&
1278             i915_request_completed(rq))
1279                 intel_engine_add_retire(engine, ce->timeline);
1280
1281         intel_engine_context_out(engine);
1282         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1283         intel_gt_pm_put_async(engine->gt);
1284
1285         /*
1286          * If this is part of a virtual engine, its next request may
1287          * have been blocked waiting for access to the active context.
1288          * We have to kick all the siblings again in case we need to
1289          * switch (e.g. the next request is not runnable on this
1290          * engine). Hopefully, we will already have submitted the next
1291          * request before the tasklet runs and do not need to rebuild
1292          * each virtual tree and kick everyone again.
1293          */
1294         if (ce->engine != engine)
1295                 kick_siblings(rq, ce);
1296
1297         intel_context_put(ce);
1298 }
1299
1300 static inline void
1301 execlists_schedule_out(struct i915_request *rq)
1302 {
1303         struct intel_context * const ce = rq->context;
1304         struct intel_engine_cs *cur, *old;
1305
1306         trace_i915_request_out(rq);
1307
1308         old = READ_ONCE(ce->inflight);
1309         do
1310                 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1311         while (!try_cmpxchg(&ce->inflight, &old, cur));
1312         if (!cur)
1313                 __execlists_schedule_out(rq, old);
1314
1315         i915_request_put(rq);
1316 }
1317
1318 static u64 execlists_update_context(struct i915_request *rq)
1319 {
1320         struct intel_context *ce = rq->context;
1321         u64 desc = ce->lrc_desc;
1322         u32 tail;
1323
1324         /*
1325          * WaIdleLiteRestore:bdw,skl
1326          *
1327          * We should never submit the context with the same RING_TAIL twice
1328          * just in case we submit an empty ring, which confuses the HW.
1329          *
1330          * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1331          * the normal request to be able to always advance the RING_TAIL on
1332          * subsequent resubmissions (for lite restore). Should that fail us,
1333          * and we try and submit the same tail again, force the context
1334          * reload.
1335          */
1336         tail = intel_ring_set_tail(rq->ring, rq->tail);
1337         if (unlikely(ce->lrc_reg_state[CTX_RING_TAIL] == tail))
1338                 desc |= CTX_DESC_FORCE_RESTORE;
1339         ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1340         rq->tail = rq->wa_tail;
1341
1342         /*
1343          * Make sure the context image is complete before we submit it to HW.
1344          *
1345          * Ostensibly, writes (including the WCB) should be flushed prior to
1346          * an uncached write such as our mmio register access, the empirical
1347          * evidence (esp. on Braswell) suggests that the WC write into memory
1348          * may not be visible to the HW prior to the completion of the UC
1349          * register write and that we may begin execution from the context
1350          * before its image is complete leading to invalid PD chasing.
1351          */
1352         wmb();
1353
1354         ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1355         return desc;
1356 }
1357
1358 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1359 {
1360         if (execlists->ctrl_reg) {
1361                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1362                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1363         } else {
1364                 writel(upper_32_bits(desc), execlists->submit_reg);
1365                 writel(lower_32_bits(desc), execlists->submit_reg);
1366         }
1367 }
1368
1369 static __maybe_unused void
1370 trace_ports(const struct intel_engine_execlists *execlists,
1371             const char *msg,
1372             struct i915_request * const *ports)
1373 {
1374         const struct intel_engine_cs *engine =
1375                 container_of(execlists, typeof(*engine), execlists);
1376
1377         if (!ports[0])
1378                 return;
1379
1380         ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1381                      ports[0]->fence.context,
1382                      ports[0]->fence.seqno,
1383                      i915_request_completed(ports[0]) ? "!" :
1384                      i915_request_started(ports[0]) ? "*" :
1385                      "",
1386                      ports[1] ? ports[1]->fence.context : 0,
1387                      ports[1] ? ports[1]->fence.seqno : 0);
1388 }
1389
1390 static __maybe_unused bool
1391 assert_pending_valid(const struct intel_engine_execlists *execlists,
1392                      const char *msg)
1393 {
1394         struct i915_request * const *port, *rq;
1395         struct intel_context *ce = NULL;
1396
1397         trace_ports(execlists, msg, execlists->pending);
1398
1399         if (!execlists->pending[0]) {
1400                 GEM_TRACE_ERR("Nothing pending for promotion!\n");
1401                 return false;
1402         }
1403
1404         if (execlists->pending[execlists_num_ports(execlists)]) {
1405                 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1406                               execlists_num_ports(execlists));
1407                 return false;
1408         }
1409
1410         for (port = execlists->pending; (rq = *port); port++) {
1411                 unsigned long flags;
1412                 bool ok = true;
1413
1414                 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1415                 GEM_BUG_ON(!i915_request_is_active(rq));
1416
1417                 if (ce == rq->context) {
1418                         GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1419                                       ce->timeline->fence_context,
1420                                       port - execlists->pending);
1421                         return false;
1422                 }
1423                 ce = rq->context;
1424
1425                 /* Hold tightly onto the lock to prevent concurrent retires! */
1426                 if (!spin_trylock_irqsave(&rq->lock, flags))
1427                         continue;
1428
1429                 if (i915_request_completed(rq))
1430                         goto unlock;
1431
1432                 if (i915_active_is_idle(&ce->active) &&
1433                     !intel_context_is_barrier(ce)) {
1434                         GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1435                                       ce->timeline->fence_context,
1436                                       port - execlists->pending);
1437                         ok = false;
1438                         goto unlock;
1439                 }
1440
1441                 if (!i915_vma_is_pinned(ce->state)) {
1442                         GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1443                                       ce->timeline->fence_context,
1444                                       port - execlists->pending);
1445                         ok = false;
1446                         goto unlock;
1447                 }
1448
1449                 if (!i915_vma_is_pinned(ce->ring->vma)) {
1450                         GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1451                                       ce->timeline->fence_context,
1452                                       port - execlists->pending);
1453                         ok = false;
1454                         goto unlock;
1455                 }
1456
1457 unlock:
1458                 spin_unlock_irqrestore(&rq->lock, flags);
1459                 if (!ok)
1460                         return false;
1461         }
1462
1463         return ce;
1464 }
1465
1466 static void execlists_submit_ports(struct intel_engine_cs *engine)
1467 {
1468         struct intel_engine_execlists *execlists = &engine->execlists;
1469         unsigned int n;
1470
1471         GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1472
1473         /*
1474          * We can skip acquiring intel_runtime_pm_get() here as it was taken
1475          * on our behalf by the request (see i915_gem_mark_busy()) and it will
1476          * not be relinquished until the device is idle (see
1477          * i915_gem_idle_work_handler()). As a precaution, we make sure
1478          * that all ELSP are drained i.e. we have processed the CSB,
1479          * before allowing ourselves to idle and calling intel_runtime_pm_put().
1480          */
1481         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1482
1483         /*
1484          * ELSQ note: the submit queue is not cleared after being submitted
1485          * to the HW so we need to make sure we always clean it up. This is
1486          * currently ensured by the fact that we always write the same number
1487          * of elsq entries, keep this in mind before changing the loop below.
1488          */
1489         for (n = execlists_num_ports(execlists); n--; ) {
1490                 struct i915_request *rq = execlists->pending[n];
1491
1492                 write_desc(execlists,
1493                            rq ? execlists_update_context(rq) : 0,
1494                            n);
1495         }
1496
1497         /* we need to manually load the submit queue */
1498         if (execlists->ctrl_reg)
1499                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1500 }
1501
1502 static bool ctx_single_port_submission(const struct intel_context *ce)
1503 {
1504         return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1505                 intel_context_force_single_submission(ce));
1506 }
1507
1508 static bool can_merge_ctx(const struct intel_context *prev,
1509                           const struct intel_context *next)
1510 {
1511         if (prev != next)
1512                 return false;
1513
1514         if (ctx_single_port_submission(prev))
1515                 return false;
1516
1517         return true;
1518 }
1519
1520 static bool can_merge_rq(const struct i915_request *prev,
1521                          const struct i915_request *next)
1522 {
1523         GEM_BUG_ON(prev == next);
1524         GEM_BUG_ON(!assert_priority_queue(prev, next));
1525
1526         /*
1527          * We do not submit known completed requests. Therefore if the next
1528          * request is already completed, we can pretend to merge it in
1529          * with the previous context (and we will skip updating the ELSP
1530          * and tracking). Thus hopefully keeping the ELSP full with active
1531          * contexts, despite the best efforts of preempt-to-busy to confuse
1532          * us.
1533          */
1534         if (i915_request_completed(next))
1535                 return true;
1536
1537         if (unlikely((prev->fence.flags ^ next->fence.flags) &
1538                      (I915_FENCE_FLAG_NOPREEMPT | I915_FENCE_FLAG_SENTINEL)))
1539                 return false;
1540
1541         if (!can_merge_ctx(prev->context, next->context))
1542                 return false;
1543
1544         return true;
1545 }
1546
1547 static void virtual_update_register_offsets(u32 *regs,
1548                                             struct intel_engine_cs *engine)
1549 {
1550         set_offsets(regs, reg_offsets(engine), engine, false);
1551 }
1552
1553 static bool virtual_matches(const struct virtual_engine *ve,
1554                             const struct i915_request *rq,
1555                             const struct intel_engine_cs *engine)
1556 {
1557         const struct intel_engine_cs *inflight;
1558
1559         if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1560                 return false;
1561
1562         /*
1563          * We track when the HW has completed saving the context image
1564          * (i.e. when we have seen the final CS event switching out of
1565          * the context) and must not overwrite the context image before
1566          * then. This restricts us to only using the active engine
1567          * while the previous virtualized request is inflight (so
1568          * we reuse the register offsets). This is a very small
1569          * hystersis on the greedy seelction algorithm.
1570          */
1571         inflight = intel_context_inflight(&ve->context);
1572         if (inflight && inflight != engine)
1573                 return false;
1574
1575         return true;
1576 }
1577
1578 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1579                                      struct intel_engine_cs *engine)
1580 {
1581         struct intel_engine_cs *old = ve->siblings[0];
1582
1583         /* All unattached (rq->engine == old) must already be completed */
1584
1585         spin_lock(&old->breadcrumbs.irq_lock);
1586         if (!list_empty(&ve->context.signal_link)) {
1587                 list_move_tail(&ve->context.signal_link,
1588                                &engine->breadcrumbs.signalers);
1589                 intel_engine_signal_breadcrumbs(engine);
1590         }
1591         spin_unlock(&old->breadcrumbs.irq_lock);
1592 }
1593
1594 static struct i915_request *
1595 last_active(const struct intel_engine_execlists *execlists)
1596 {
1597         struct i915_request * const *last = READ_ONCE(execlists->active);
1598
1599         while (*last && i915_request_completed(*last))
1600                 last++;
1601
1602         return *last;
1603 }
1604
1605 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1606 {
1607         LIST_HEAD(list);
1608
1609         /*
1610          * We want to move the interrupted request to the back of
1611          * the round-robin list (i.e. its priority level), but
1612          * in doing so, we must then move all requests that were in
1613          * flight and were waiting for the interrupted request to
1614          * be run after it again.
1615          */
1616         do {
1617                 struct i915_dependency *p;
1618
1619                 GEM_BUG_ON(i915_request_is_active(rq));
1620                 list_move_tail(&rq->sched.link, pl);
1621
1622                 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
1623                         struct i915_request *w =
1624                                 container_of(p->waiter, typeof(*w), sched);
1625
1626                         /* Leave semaphores spinning on the other engines */
1627                         if (w->engine != rq->engine)
1628                                 continue;
1629
1630                         /* No waiter should start before its signaler */
1631                         GEM_BUG_ON(i915_request_started(w) &&
1632                                    !i915_request_completed(rq));
1633
1634                         GEM_BUG_ON(i915_request_is_active(w));
1635                         if (list_empty(&w->sched.link))
1636                                 continue; /* Not yet submitted; unready */
1637
1638                         if (rq_prio(w) < rq_prio(rq))
1639                                 continue;
1640
1641                         GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1642                         list_move_tail(&w->sched.link, &list);
1643                 }
1644
1645                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1646         } while (rq);
1647 }
1648
1649 static void defer_active(struct intel_engine_cs *engine)
1650 {
1651         struct i915_request *rq;
1652
1653         rq = __unwind_incomplete_requests(engine);
1654         if (!rq)
1655                 return;
1656
1657         defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1658 }
1659
1660 static bool
1661 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1662 {
1663         int hint;
1664
1665         if (!intel_engine_has_timeslices(engine))
1666                 return false;
1667
1668         if (list_is_last(&rq->sched.link, &engine->active.requests))
1669                 return false;
1670
1671         hint = max(rq_prio(list_next_entry(rq, sched.link)),
1672                    engine->execlists.queue_priority_hint);
1673
1674         return hint >= effective_prio(rq);
1675 }
1676
1677 static int
1678 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1679 {
1680         if (list_is_last(&rq->sched.link, &engine->active.requests))
1681                 return INT_MIN;
1682
1683         return rq_prio(list_next_entry(rq, sched.link));
1684 }
1685
1686 static inline unsigned long
1687 timeslice(const struct intel_engine_cs *engine)
1688 {
1689         return READ_ONCE(engine->props.timeslice_duration_ms);
1690 }
1691
1692 static unsigned long
1693 active_timeslice(const struct intel_engine_cs *engine)
1694 {
1695         const struct i915_request *rq = *engine->execlists.active;
1696
1697         if (!rq || i915_request_completed(rq))
1698                 return 0;
1699
1700         if (engine->execlists.switch_priority_hint < effective_prio(rq))
1701                 return 0;
1702
1703         return timeslice(engine);
1704 }
1705
1706 static void set_timeslice(struct intel_engine_cs *engine)
1707 {
1708         if (!intel_engine_has_timeslices(engine))
1709                 return;
1710
1711         set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1712 }
1713
1714 static void record_preemption(struct intel_engine_execlists *execlists)
1715 {
1716         (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1717 }
1718
1719 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
1720 {
1721         struct i915_request *rq;
1722
1723         rq = last_active(&engine->execlists);
1724         if (!rq)
1725                 return 0;
1726
1727         /* Force a fast reset for terminated contexts (ignoring sysfs!) */
1728         if (unlikely(intel_context_is_banned(rq->context)))
1729                 return 1;
1730
1731         return READ_ONCE(engine->props.preempt_timeout_ms);
1732 }
1733
1734 static void set_preempt_timeout(struct intel_engine_cs *engine)
1735 {
1736         if (!intel_engine_has_preempt_reset(engine))
1737                 return;
1738
1739         set_timer_ms(&engine->execlists.preempt,
1740                      active_preempt_timeout(engine));
1741 }
1742
1743 static inline void clear_ports(struct i915_request **ports, int count)
1744 {
1745         memset_p((void **)ports, NULL, count);
1746 }
1747
1748 static void execlists_dequeue(struct intel_engine_cs *engine)
1749 {
1750         struct intel_engine_execlists * const execlists = &engine->execlists;
1751         struct i915_request **port = execlists->pending;
1752         struct i915_request ** const last_port = port + execlists->port_mask;
1753         struct i915_request *last;
1754         struct rb_node *rb;
1755         bool submit = false;
1756
1757         /*
1758          * Hardware submission is through 2 ports. Conceptually each port
1759          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1760          * static for a context, and unique to each, so we only execute
1761          * requests belonging to a single context from each ring. RING_HEAD
1762          * is maintained by the CS in the context image, it marks the place
1763          * where it got up to last time, and through RING_TAIL we tell the CS
1764          * where we want to execute up to this time.
1765          *
1766          * In this list the requests are in order of execution. Consecutive
1767          * requests from the same context are adjacent in the ringbuffer. We
1768          * can combine these requests into a single RING_TAIL update:
1769          *
1770          *              RING_HEAD...req1...req2
1771          *                                    ^- RING_TAIL
1772          * since to execute req2 the CS must first execute req1.
1773          *
1774          * Our goal then is to point each port to the end of a consecutive
1775          * sequence of requests as being the most optimal (fewest wake ups
1776          * and context switches) submission.
1777          */
1778
1779         for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1780                 struct virtual_engine *ve =
1781                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1782                 struct i915_request *rq = READ_ONCE(ve->request);
1783
1784                 if (!rq) { /* lazily cleanup after another engine handled rq */
1785                         rb_erase_cached(rb, &execlists->virtual);
1786                         RB_CLEAR_NODE(rb);
1787                         rb = rb_first_cached(&execlists->virtual);
1788                         continue;
1789                 }
1790
1791                 if (!virtual_matches(ve, rq, engine)) {
1792                         rb = rb_next(rb);
1793                         continue;
1794                 }
1795
1796                 break;
1797         }
1798
1799         /*
1800          * If the queue is higher priority than the last
1801          * request in the currently active context, submit afresh.
1802          * We will resubmit again afterwards in case we need to split
1803          * the active context to interject the preemption request,
1804          * i.e. we will retrigger preemption following the ack in case
1805          * of trouble.
1806          */
1807         last = last_active(execlists);
1808         if (last) {
1809                 if (need_preempt(engine, last, rb)) {
1810                         ENGINE_TRACE(engine,
1811                                      "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1812                                      last->fence.context,
1813                                      last->fence.seqno,
1814                                      last->sched.attr.priority,
1815                                      execlists->queue_priority_hint);
1816                         record_preemption(execlists);
1817
1818                         /*
1819                          * Don't let the RING_HEAD advance past the breadcrumb
1820                          * as we unwind (and until we resubmit) so that we do
1821                          * not accidentally tell it to go backwards.
1822                          */
1823                         ring_set_paused(engine, 1);
1824
1825                         /*
1826                          * Note that we have not stopped the GPU at this point,
1827                          * so we are unwinding the incomplete requests as they
1828                          * remain inflight and so by the time we do complete
1829                          * the preemption, some of the unwound requests may
1830                          * complete!
1831                          */
1832                         __unwind_incomplete_requests(engine);
1833
1834                         /*
1835                          * If we need to return to the preempted context, we
1836                          * need to skip the lite-restore and force it to
1837                          * reload the RING_TAIL. Otherwise, the HW has a
1838                          * tendency to ignore us rewinding the TAIL to the
1839                          * end of an earlier request.
1840                          */
1841                         last->context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1842                         last = NULL;
1843                 } else if (need_timeslice(engine, last) &&
1844                            timer_expired(&engine->execlists.timer)) {
1845                         ENGINE_TRACE(engine,
1846                                      "expired last=%llx:%lld, prio=%d, hint=%d\n",
1847                                      last->fence.context,
1848                                      last->fence.seqno,
1849                                      last->sched.attr.priority,
1850                                      execlists->queue_priority_hint);
1851
1852                         ring_set_paused(engine, 1);
1853                         defer_active(engine);
1854
1855                         /*
1856                          * Unlike for preemption, if we rewind and continue
1857                          * executing the same context as previously active,
1858                          * the order of execution will remain the same and
1859                          * the tail will only advance. We do not need to
1860                          * force a full context restore, as a lite-restore
1861                          * is sufficient to resample the monotonic TAIL.
1862                          *
1863                          * If we switch to any other context, similarly we
1864                          * will not rewind TAIL of current context, and
1865                          * normal save/restore will preserve state and allow
1866                          * us to later continue executing the same request.
1867                          */
1868                         last = NULL;
1869                 } else {
1870                         /*
1871                          * Otherwise if we already have a request pending
1872                          * for execution after the current one, we can
1873                          * just wait until the next CS event before
1874                          * queuing more. In either case we will force a
1875                          * lite-restore preemption event, but if we wait
1876                          * we hopefully coalesce several updates into a single
1877                          * submission.
1878                          */
1879                         if (!list_is_last(&last->sched.link,
1880                                           &engine->active.requests)) {
1881                                 /*
1882                                  * Even if ELSP[1] is occupied and not worthy
1883                                  * of timeslices, our queue might be.
1884                                  */
1885                                 if (!execlists->timer.expires &&
1886                                     need_timeslice(engine, last))
1887                                         set_timer_ms(&execlists->timer,
1888                                                      timeslice(engine));
1889
1890                                 return;
1891                         }
1892                 }
1893         }
1894
1895         while (rb) { /* XXX virtual is always taking precedence */
1896                 struct virtual_engine *ve =
1897                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1898                 struct i915_request *rq;
1899
1900                 spin_lock(&ve->base.active.lock);
1901
1902                 rq = ve->request;
1903                 if (unlikely(!rq)) { /* lost the race to a sibling */
1904                         spin_unlock(&ve->base.active.lock);
1905                         rb_erase_cached(rb, &execlists->virtual);
1906                         RB_CLEAR_NODE(rb);
1907                         rb = rb_first_cached(&execlists->virtual);
1908                         continue;
1909                 }
1910
1911                 GEM_BUG_ON(rq != ve->request);
1912                 GEM_BUG_ON(rq->engine != &ve->base);
1913                 GEM_BUG_ON(rq->context != &ve->context);
1914
1915                 if (rq_prio(rq) >= queue_prio(execlists)) {
1916                         if (!virtual_matches(ve, rq, engine)) {
1917                                 spin_unlock(&ve->base.active.lock);
1918                                 rb = rb_next(rb);
1919                                 continue;
1920                         }
1921
1922                         if (last && !can_merge_rq(last, rq)) {
1923                                 spin_unlock(&ve->base.active.lock);
1924                                 return; /* leave this for another */
1925                         }
1926
1927                         ENGINE_TRACE(engine,
1928                                      "virtual rq=%llx:%lld%s, new engine? %s\n",
1929                                      rq->fence.context,
1930                                      rq->fence.seqno,
1931                                      i915_request_completed(rq) ? "!" :
1932                                      i915_request_started(rq) ? "*" :
1933                                      "",
1934                                      yesno(engine != ve->siblings[0]));
1935
1936                         ve->request = NULL;
1937                         ve->base.execlists.queue_priority_hint = INT_MIN;
1938                         rb_erase_cached(rb, &execlists->virtual);
1939                         RB_CLEAR_NODE(rb);
1940
1941                         GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1942                         rq->engine = engine;
1943
1944                         if (engine != ve->siblings[0]) {
1945                                 u32 *regs = ve->context.lrc_reg_state;
1946                                 unsigned int n;
1947
1948                                 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1949
1950                                 if (!intel_engine_has_relative_mmio(engine))
1951                                         virtual_update_register_offsets(regs,
1952                                                                         engine);
1953
1954                                 if (!list_empty(&ve->context.signals))
1955                                         virtual_xfer_breadcrumbs(ve, engine);
1956
1957                                 /*
1958                                  * Move the bound engine to the top of the list
1959                                  * for future execution. We then kick this
1960                                  * tasklet first before checking others, so that
1961                                  * we preferentially reuse this set of bound
1962                                  * registers.
1963                                  */
1964                                 for (n = 1; n < ve->num_siblings; n++) {
1965                                         if (ve->siblings[n] == engine) {
1966                                                 swap(ve->siblings[n],
1967                                                      ve->siblings[0]);
1968                                                 break;
1969                                         }
1970                                 }
1971
1972                                 GEM_BUG_ON(ve->siblings[0] != engine);
1973                         }
1974
1975                         if (__i915_request_submit(rq)) {
1976                                 submit = true;
1977                                 last = rq;
1978                         }
1979                         i915_request_put(rq);
1980
1981                         /*
1982                          * Hmm, we have a bunch of virtual engine requests,
1983                          * but the first one was already completed (thanks
1984                          * preempt-to-busy!). Keep looking at the veng queue
1985                          * until we have no more relevant requests (i.e.
1986                          * the normal submit queue has higher priority).
1987                          */
1988                         if (!submit) {
1989                                 spin_unlock(&ve->base.active.lock);
1990                                 rb = rb_first_cached(&execlists->virtual);
1991                                 continue;
1992                         }
1993                 }
1994
1995                 spin_unlock(&ve->base.active.lock);
1996                 break;
1997         }
1998
1999         while ((rb = rb_first_cached(&execlists->queue))) {
2000                 struct i915_priolist *p = to_priolist(rb);
2001                 struct i915_request *rq, *rn;
2002                 int i;
2003
2004                 priolist_for_each_request_consume(rq, rn, p, i) {
2005                         bool merge = true;
2006
2007                         /*
2008                          * Can we combine this request with the current port?
2009                          * It has to be the same context/ringbuffer and not
2010                          * have any exceptions (e.g. GVT saying never to
2011                          * combine contexts).
2012                          *
2013                          * If we can combine the requests, we can execute both
2014                          * by updating the RING_TAIL to point to the end of the
2015                          * second request, and so we never need to tell the
2016                          * hardware about the first.
2017                          */
2018                         if (last && !can_merge_rq(last, rq)) {
2019                                 /*
2020                                  * If we are on the second port and cannot
2021                                  * combine this request with the last, then we
2022                                  * are done.
2023                                  */
2024                                 if (port == last_port)
2025                                         goto done;
2026
2027                                 /*
2028                                  * We must not populate both ELSP[] with the
2029                                  * same LRCA, i.e. we must submit 2 different
2030                                  * contexts if we submit 2 ELSP.
2031                                  */
2032                                 if (last->context == rq->context)
2033                                         goto done;
2034
2035                                 if (i915_request_has_sentinel(last))
2036                                         goto done;
2037
2038                                 /*
2039                                  * If GVT overrides us we only ever submit
2040                                  * port[0], leaving port[1] empty. Note that we
2041                                  * also have to be careful that we don't queue
2042                                  * the same context (even though a different
2043                                  * request) to the second port.
2044                                  */
2045                                 if (ctx_single_port_submission(last->context) ||
2046                                     ctx_single_port_submission(rq->context))
2047                                         goto done;
2048
2049                                 merge = false;
2050                         }
2051
2052                         if (__i915_request_submit(rq)) {
2053                                 if (!merge) {
2054                                         *port = execlists_schedule_in(last, port - execlists->pending);
2055                                         port++;
2056                                         last = NULL;
2057                                 }
2058
2059                                 GEM_BUG_ON(last &&
2060                                            !can_merge_ctx(last->context,
2061                                                           rq->context));
2062
2063                                 submit = true;
2064                                 last = rq;
2065                         }
2066                 }
2067
2068                 rb_erase_cached(&p->node, &execlists->queue);
2069                 i915_priolist_free(p);
2070         }
2071
2072 done:
2073         /*
2074          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2075          *
2076          * We choose the priority hint such that if we add a request of greater
2077          * priority than this, we kick the submission tasklet to decide on
2078          * the right order of submitting the requests to hardware. We must
2079          * also be prepared to reorder requests as they are in-flight on the
2080          * HW. We derive the priority hint then as the first "hole" in
2081          * the HW submission ports and if there are no available slots,
2082          * the priority of the lowest executing request, i.e. last.
2083          *
2084          * When we do receive a higher priority request ready to run from the
2085          * user, see queue_request(), the priority hint is bumped to that
2086          * request triggering preemption on the next dequeue (or subsequent
2087          * interrupt for secondary ports).
2088          */
2089         execlists->queue_priority_hint = queue_prio(execlists);
2090
2091         if (submit) {
2092                 *port = execlists_schedule_in(last, port - execlists->pending);
2093                 execlists->switch_priority_hint =
2094                         switch_prio(engine, *execlists->pending);
2095
2096                 /*
2097                  * Skip if we ended up with exactly the same set of requests,
2098                  * e.g. trying to timeslice a pair of ordered contexts
2099                  */
2100                 if (!memcmp(execlists->active, execlists->pending,
2101                             (port - execlists->pending + 1) * sizeof(*port))) {
2102                         do
2103                                 execlists_schedule_out(fetch_and_zero(port));
2104                         while (port-- != execlists->pending);
2105
2106                         goto skip_submit;
2107                 }
2108                 clear_ports(port + 1, last_port - port);
2109
2110                 execlists_submit_ports(engine);
2111                 set_preempt_timeout(engine);
2112         } else {
2113 skip_submit:
2114                 ring_set_paused(engine, 0);
2115         }
2116 }
2117
2118 static void
2119 cancel_port_requests(struct intel_engine_execlists * const execlists)
2120 {
2121         struct i915_request * const *port;
2122
2123         for (port = execlists->pending; *port; port++)
2124                 execlists_schedule_out(*port);
2125         clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2126
2127         /* Mark the end of active before we overwrite *active */
2128         for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2129                 execlists_schedule_out(*port);
2130         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2131
2132         WRITE_ONCE(execlists->active, execlists->inflight);
2133 }
2134
2135 static inline void
2136 invalidate_csb_entries(const u32 *first, const u32 *last)
2137 {
2138         clflush((void *)first);
2139         clflush((void *)last);
2140 }
2141
2142 static inline bool
2143 reset_in_progress(const struct intel_engine_execlists *execlists)
2144 {
2145         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
2146 }
2147
2148 /*
2149  * Starting with Gen12, the status has a new format:
2150  *
2151  *     bit  0:     switched to new queue
2152  *     bit  1:     reserved
2153  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2154  *                 switch detail is set to "wait on semaphore"
2155  *     bits 3-5:   engine class
2156  *     bits 6-11:  engine instance
2157  *     bits 12-14: reserved
2158  *     bits 15-25: sw context id of the lrc the GT switched to
2159  *     bits 26-31: sw counter of the lrc the GT switched to
2160  *     bits 32-35: context switch detail
2161  *                  - 0: ctx complete
2162  *                  - 1: wait on sync flip
2163  *                  - 2: wait on vblank
2164  *                  - 3: wait on scanline
2165  *                  - 4: wait on semaphore
2166  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2167  *                       WAIT_FOR_EVENT)
2168  *     bit  36:    reserved
2169  *     bits 37-43: wait detail (for switch detail 1 to 4)
2170  *     bits 44-46: reserved
2171  *     bits 47-57: sw context id of the lrc the GT switched away from
2172  *     bits 58-63: sw counter of the lrc the GT switched away from
2173  */
2174 static inline bool
2175 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2176 {
2177         u32 lower_dw = csb[0];
2178         u32 upper_dw = csb[1];
2179         bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2180         bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2181         bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2182
2183         /*
2184          * The context switch detail is not guaranteed to be 5 when a preemption
2185          * occurs, so we can't just check for that. The check below works for
2186          * all the cases we care about, including preemptions of WAIT
2187          * instructions and lite-restore. Preempt-to-idle via the CTRL register
2188          * would require some extra handling, but we don't support that.
2189          */
2190         if (!ctx_away_valid || new_queue) {
2191                 GEM_BUG_ON(!ctx_to_valid);
2192                 return true;
2193         }
2194
2195         /*
2196          * switch detail = 5 is covered by the case above and we do not expect a
2197          * context switch on an unsuccessful wait instruction since we always
2198          * use polling mode.
2199          */
2200         GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2201         return false;
2202 }
2203
2204 static inline bool
2205 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2206 {
2207         return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2208 }
2209
2210 static void process_csb(struct intel_engine_cs *engine)
2211 {
2212         struct intel_engine_execlists * const execlists = &engine->execlists;
2213         const u32 * const buf = execlists->csb_status;
2214         const u8 num_entries = execlists->csb_size;
2215         u8 head, tail;
2216
2217         /*
2218          * As we modify our execlists state tracking we require exclusive
2219          * access. Either we are inside the tasklet, or the tasklet is disabled
2220          * and we assume that is only inside the reset paths and so serialised.
2221          */
2222         GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2223                    !reset_in_progress(execlists));
2224         GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2225
2226         /*
2227          * Note that csb_write, csb_status may be either in HWSP or mmio.
2228          * When reading from the csb_write mmio register, we have to be
2229          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2230          * the low 4bits. As it happens we know the next 4bits are always
2231          * zero and so we can simply masked off the low u8 of the register
2232          * and treat it identically to reading from the HWSP (without having
2233          * to use explicit shifting and masking, and probably bifurcating
2234          * the code to handle the legacy mmio read).
2235          */
2236         head = execlists->csb_head;
2237         tail = READ_ONCE(*execlists->csb_write);
2238         ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2239         if (unlikely(head == tail))
2240                 return;
2241
2242         /*
2243          * Hopefully paired with a wmb() in HW!
2244          *
2245          * We must complete the read of the write pointer before any reads
2246          * from the CSB, so that we do not see stale values. Without an rmb
2247          * (lfence) the HW may speculatively perform the CSB[] reads *before*
2248          * we perform the READ_ONCE(*csb_write).
2249          */
2250         rmb();
2251
2252         do {
2253                 bool promote;
2254
2255                 if (++head == num_entries)
2256                         head = 0;
2257
2258                 /*
2259                  * We are flying near dragons again.
2260                  *
2261                  * We hold a reference to the request in execlist_port[]
2262                  * but no more than that. We are operating in softirq
2263                  * context and so cannot hold any mutex or sleep. That
2264                  * prevents us stopping the requests we are processing
2265                  * in port[] from being retired simultaneously (the
2266                  * breadcrumb will be complete before we see the
2267                  * context-switch). As we only hold the reference to the
2268                  * request, any pointer chasing underneath the request
2269                  * is subject to a potential use-after-free. Thus we
2270                  * store all of the bookkeeping within port[] as
2271                  * required, and avoid using unguarded pointers beneath
2272                  * request itself. The same applies to the atomic
2273                  * status notifier.
2274                  */
2275
2276                 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2277                              head, buf[2 * head + 0], buf[2 * head + 1]);
2278
2279                 if (INTEL_GEN(engine->i915) >= 12)
2280                         promote = gen12_csb_parse(execlists, buf + 2 * head);
2281                 else
2282                         promote = gen8_csb_parse(execlists, buf + 2 * head);
2283                 if (promote) {
2284                         struct i915_request * const *old = execlists->active;
2285
2286                         /* Point active to the new ELSP; prevent overwriting */
2287                         WRITE_ONCE(execlists->active, execlists->pending);
2288
2289                         if (!inject_preempt_hang(execlists))
2290                                 ring_set_paused(engine, 0);
2291
2292                         /* cancel old inflight, prepare for switch */
2293                         trace_ports(execlists, "preempted", old);
2294                         while (*old)
2295                                 execlists_schedule_out(*old++);
2296
2297                         /* switch pending to inflight */
2298                         GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2299                         WRITE_ONCE(execlists->active,
2300                                    memcpy(execlists->inflight,
2301                                           execlists->pending,
2302                                           execlists_num_ports(execlists) *
2303                                           sizeof(*execlists->pending)));
2304
2305                         WRITE_ONCE(execlists->pending[0], NULL);
2306                 } else {
2307                         GEM_BUG_ON(!*execlists->active);
2308
2309                         /* port0 completed, advanced to port1 */
2310                         trace_ports(execlists, "completed", execlists->active);
2311
2312                         /*
2313                          * We rely on the hardware being strongly
2314                          * ordered, that the breadcrumb write is
2315                          * coherent (visible from the CPU) before the
2316                          * user interrupt and CSB is processed.
2317                          */
2318                         GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2319                                    !reset_in_progress(execlists));
2320                         execlists_schedule_out(*execlists->active++);
2321
2322                         GEM_BUG_ON(execlists->active - execlists->inflight >
2323                                    execlists_num_ports(execlists));
2324                 }
2325         } while (head != tail);
2326
2327         execlists->csb_head = head;
2328         set_timeslice(engine);
2329
2330         /*
2331          * Gen11 has proven to fail wrt global observation point between
2332          * entry and tail update, failing on the ordering and thus
2333          * we see an old entry in the context status buffer.
2334          *
2335          * Forcibly evict out entries for the next gpu csb update,
2336          * to increase the odds that we get a fresh entries with non
2337          * working hardware. The cost for doing so comes out mostly with
2338          * the wash as hardware, working or not, will need to do the
2339          * invalidation before.
2340          */
2341         invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2342 }
2343
2344 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2345 {
2346         lockdep_assert_held(&engine->active.lock);
2347         if (!engine->execlists.pending[0]) {
2348                 rcu_read_lock(); /* protect peeking at execlists->active */
2349                 execlists_dequeue(engine);
2350                 rcu_read_unlock();
2351         }
2352 }
2353
2354 static noinline void preempt_reset(struct intel_engine_cs *engine)
2355 {
2356         const unsigned int bit = I915_RESET_ENGINE + engine->id;
2357         unsigned long *lock = &engine->gt->reset.flags;
2358
2359         if (i915_modparams.reset < 3)
2360                 return;
2361
2362         if (test_and_set_bit(bit, lock))
2363                 return;
2364
2365         /* Mark this tasklet as disabled to avoid waiting for it to complete */
2366         tasklet_disable_nosync(&engine->execlists.tasklet);
2367
2368         ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
2369                      READ_ONCE(engine->props.preempt_timeout_ms),
2370                      jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2371         intel_engine_reset(engine, "preemption time out");
2372
2373         tasklet_enable(&engine->execlists.tasklet);
2374         clear_and_wake_up_bit(bit, lock);
2375 }
2376
2377 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2378 {
2379         const struct timer_list *t = &engine->execlists.preempt;
2380
2381         if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2382                 return false;
2383
2384         if (!timer_expired(t))
2385                 return false;
2386
2387         return READ_ONCE(engine->execlists.pending[0]);
2388 }
2389
2390 /*
2391  * Check the unread Context Status Buffers and manage the submission of new
2392  * contexts to the ELSP accordingly.
2393  */
2394 static void execlists_submission_tasklet(unsigned long data)
2395 {
2396         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2397         bool timeout = preempt_timeout(engine);
2398
2399         process_csb(engine);
2400         if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2401                 unsigned long flags;
2402
2403                 spin_lock_irqsave(&engine->active.lock, flags);
2404                 __execlists_submission_tasklet(engine);
2405                 spin_unlock_irqrestore(&engine->active.lock, flags);
2406
2407                 /* Recheck after serialising with direct-submission */
2408                 if (timeout && preempt_timeout(engine))
2409                         preempt_reset(engine);
2410         }
2411 }
2412
2413 static void __execlists_kick(struct intel_engine_execlists *execlists)
2414 {
2415         /* Kick the tasklet for some interrupt coalescing and reset handling */
2416         tasklet_hi_schedule(&execlists->tasklet);
2417 }
2418
2419 #define execlists_kick(t, member) \
2420         __execlists_kick(container_of(t, struct intel_engine_execlists, member))
2421
2422 static void execlists_timeslice(struct timer_list *timer)
2423 {
2424         execlists_kick(timer, timer);
2425 }
2426
2427 static void execlists_preempt(struct timer_list *timer)
2428 {
2429         execlists_kick(timer, preempt);
2430 }
2431
2432 static void queue_request(struct intel_engine_cs *engine,
2433                           struct i915_sched_node *node,
2434                           int prio)
2435 {
2436         GEM_BUG_ON(!list_empty(&node->link));
2437         list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
2438 }
2439
2440 static void __submit_queue_imm(struct intel_engine_cs *engine)
2441 {
2442         struct intel_engine_execlists * const execlists = &engine->execlists;
2443
2444         if (reset_in_progress(execlists))
2445                 return; /* defer until we restart the engine following reset */
2446
2447         if (execlists->tasklet.func == execlists_submission_tasklet)
2448                 __execlists_submission_tasklet(engine);
2449         else
2450                 tasklet_hi_schedule(&execlists->tasklet);
2451 }
2452
2453 static void submit_queue(struct intel_engine_cs *engine,
2454                          const struct i915_request *rq)
2455 {
2456         struct intel_engine_execlists *execlists = &engine->execlists;
2457
2458         if (rq_prio(rq) <= execlists->queue_priority_hint)
2459                 return;
2460
2461         execlists->queue_priority_hint = rq_prio(rq);
2462         __submit_queue_imm(engine);
2463 }
2464
2465 static void execlists_submit_request(struct i915_request *request)
2466 {
2467         struct intel_engine_cs *engine = request->engine;
2468         unsigned long flags;
2469
2470         /* Will be called from irq-context when using foreign fences. */
2471         spin_lock_irqsave(&engine->active.lock, flags);
2472
2473         queue_request(engine, &request->sched, rq_prio(request));
2474
2475         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2476         GEM_BUG_ON(list_empty(&request->sched.link));
2477
2478         submit_queue(engine, request);
2479
2480         spin_unlock_irqrestore(&engine->active.lock, flags);
2481 }
2482
2483 static void __execlists_context_fini(struct intel_context *ce)
2484 {
2485         intel_ring_put(ce->ring);
2486         i915_vma_put(ce->state);
2487 }
2488
2489 static void execlists_context_destroy(struct kref *kref)
2490 {
2491         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2492
2493         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2494         GEM_BUG_ON(intel_context_is_pinned(ce));
2495
2496         if (ce->state)
2497                 __execlists_context_fini(ce);
2498
2499         intel_context_fini(ce);
2500         intel_context_free(ce);
2501 }
2502
2503 static void
2504 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2505 {
2506         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2507                 return;
2508
2509         vaddr += engine->context_size;
2510
2511         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
2512 }
2513
2514 static void
2515 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2516 {
2517         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2518                 return;
2519
2520         vaddr += engine->context_size;
2521
2522         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
2523                 dev_err_once(engine->i915->drm.dev,
2524                              "%s context redzone overwritten!\n",
2525                              engine->name);
2526 }
2527
2528 static void execlists_context_unpin(struct intel_context *ce)
2529 {
2530         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2531                       ce->engine);
2532
2533         i915_gem_object_unpin_map(ce->state->obj);
2534         intel_ring_reset(ce->ring, ce->ring->tail);
2535 }
2536
2537 static void
2538 __execlists_update_reg_state(const struct intel_context *ce,
2539                              const struct intel_engine_cs *engine)
2540 {
2541         struct intel_ring *ring = ce->ring;
2542         u32 *regs = ce->lrc_reg_state;
2543
2544         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
2545         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2546
2547         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
2548         regs[CTX_RING_HEAD] = ring->head;
2549         regs[CTX_RING_TAIL] = ring->tail;
2550
2551         /* RPCS */
2552         if (engine->class == RENDER_CLASS) {
2553                 regs[CTX_R_PWR_CLK_STATE] =
2554                         intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2555
2556                 i915_oa_init_reg_state(ce, engine);
2557         }
2558 }
2559
2560 static int
2561 __execlists_context_pin(struct intel_context *ce,
2562                         struct intel_engine_cs *engine)
2563 {
2564         void *vaddr;
2565
2566         GEM_BUG_ON(!ce->state);
2567         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2568
2569         vaddr = i915_gem_object_pin_map(ce->state->obj,
2570                                         i915_coherent_map_type(engine->i915) |
2571                                         I915_MAP_OVERRIDE);
2572         if (IS_ERR(vaddr))
2573                 return PTR_ERR(vaddr);
2574
2575         ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
2576         ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2577         __execlists_update_reg_state(ce, engine);
2578
2579         return 0;
2580 }
2581
2582 static int execlists_context_pin(struct intel_context *ce)
2583 {
2584         return __execlists_context_pin(ce, ce->engine);
2585 }
2586
2587 static int execlists_context_alloc(struct intel_context *ce)
2588 {
2589         return __execlists_context_alloc(ce, ce->engine);
2590 }
2591
2592 static void execlists_context_reset(struct intel_context *ce)
2593 {
2594         CE_TRACE(ce, "reset\n");
2595         GEM_BUG_ON(!intel_context_is_pinned(ce));
2596
2597         /*
2598          * Because we emit WA_TAIL_DWORDS there may be a disparity
2599          * between our bookkeeping in ce->ring->head and ce->ring->tail and
2600          * that stored in context. As we only write new commands from
2601          * ce->ring->tail onwards, everything before that is junk. If the GPU
2602          * starts reading from its RING_HEAD from the context, it may try to
2603          * execute that junk and die.
2604          *
2605          * The contexts that are stilled pinned on resume belong to the
2606          * kernel, and are local to each engine. All other contexts will
2607          * have their head/tail sanitized upon pinning before use, so they
2608          * will never see garbage,
2609          *
2610          * So to avoid that we reset the context images upon resume. For
2611          * simplicity, we just zero everything out.
2612          */
2613         intel_ring_reset(ce->ring, ce->ring->emit);
2614
2615         /* Scrub away the garbage */
2616         execlists_init_reg_state(ce->lrc_reg_state,
2617                                  ce, ce->engine, ce->ring, true);
2618         __execlists_update_reg_state(ce, ce->engine);
2619
2620         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
2621 }
2622
2623 static const struct intel_context_ops execlists_context_ops = {
2624         .alloc = execlists_context_alloc,
2625
2626         .pin = execlists_context_pin,
2627         .unpin = execlists_context_unpin,
2628
2629         .enter = intel_context_enter_engine,
2630         .exit = intel_context_exit_engine,
2631
2632         .reset = execlists_context_reset,
2633         .destroy = execlists_context_destroy,
2634 };
2635
2636 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2637 {
2638         u32 *cs;
2639
2640         GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2641
2642         cs = intel_ring_begin(rq, 6);
2643         if (IS_ERR(cs))
2644                 return PTR_ERR(cs);
2645
2646         /*
2647          * Check if we have been preempted before we even get started.
2648          *
2649          * After this point i915_request_started() reports true, even if
2650          * we get preempted and so are no longer running.
2651          */
2652         *cs++ = MI_ARB_CHECK;
2653         *cs++ = MI_NOOP;
2654
2655         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2656         *cs++ = i915_request_timeline(rq)->hwsp_offset;
2657         *cs++ = 0;
2658         *cs++ = rq->fence.seqno - 1;
2659
2660         intel_ring_advance(rq, cs);
2661
2662         /* Record the updated position of the request's payload */
2663         rq->infix = intel_ring_offset(rq, cs);
2664
2665         return 0;
2666 }
2667
2668 static int execlists_request_alloc(struct i915_request *request)
2669 {
2670         int ret;
2671
2672         GEM_BUG_ON(!intel_context_is_pinned(request->context));
2673
2674         /*
2675          * Flush enough space to reduce the likelihood of waiting after
2676          * we start building the request - in which case we will just
2677          * have to repeat work.
2678          */
2679         request->reserved_space += EXECLISTS_REQUEST_SIZE;
2680
2681         /*
2682          * Note that after this point, we have committed to using
2683          * this request as it is being used to both track the
2684          * state of engine initialisation and liveness of the
2685          * golden renderstate above. Think twice before you try
2686          * to cancel/unwind this request now.
2687          */
2688
2689         /* Unconditionally invalidate GPU caches and TLBs. */
2690         ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
2691         if (ret)
2692                 return ret;
2693
2694         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
2695         return 0;
2696 }
2697
2698 /*
2699  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
2700  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
2701  * but there is a slight complication as this is applied in WA batch where the
2702  * values are only initialized once so we cannot take register value at the
2703  * beginning and reuse it further; hence we save its value to memory, upload a
2704  * constant value with bit21 set and then we restore it back with the saved value.
2705  * To simplify the WA, a constant value is formed by using the default value
2706  * of this register. This shouldn't be a problem because we are only modifying
2707  * it for a short period and this batch in non-premptible. We can ofcourse
2708  * use additional instructions that read the actual value of the register
2709  * at that time and set our bit of interest but it makes the WA complicated.
2710  *
2711  * This WA is also required for Gen9 so extracting as a function avoids
2712  * code duplication.
2713  */
2714 static u32 *
2715 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
2716 {
2717         /* NB no one else is allowed to scribble over scratch + 256! */
2718         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2719         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2720         *batch++ = intel_gt_scratch_offset(engine->gt,
2721                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2722         *batch++ = 0;
2723
2724         *batch++ = MI_LOAD_REGISTER_IMM(1);
2725         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2726         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
2727
2728         batch = gen8_emit_pipe_control(batch,
2729                                        PIPE_CONTROL_CS_STALL |
2730                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
2731                                        0);
2732
2733         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2734         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2735         *batch++ = intel_gt_scratch_offset(engine->gt,
2736                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2737         *batch++ = 0;
2738
2739         return batch;
2740 }
2741
2742 /*
2743  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
2744  * initialized at the beginning and shared across all contexts but this field
2745  * helps us to have multiple batches at different offsets and select them based
2746  * on a criteria. At the moment this batch always start at the beginning of the page
2747  * and at this point we don't have multiple wa_ctx batch buffers.
2748  *
2749  * The number of WA applied are not known at the beginning; we use this field
2750  * to return the no of DWORDS written.
2751  *
2752  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
2753  * so it adds NOOPs as padding to make it cacheline aligned.
2754  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
2755  * makes a complete batch buffer.
2756  */
2757 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2758 {
2759         /* WaDisableCtxRestoreArbitration:bdw,chv */
2760         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2761
2762         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
2763         if (IS_BROADWELL(engine->i915))
2764                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2765
2766         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
2767         /* Actual scratch location is at 128 bytes offset */
2768         batch = gen8_emit_pipe_control(batch,
2769                                        PIPE_CONTROL_FLUSH_L3 |
2770                                        PIPE_CONTROL_STORE_DATA_INDEX |
2771                                        PIPE_CONTROL_CS_STALL |
2772                                        PIPE_CONTROL_QW_WRITE,
2773                                        LRC_PPHWSP_SCRATCH_ADDR);
2774
2775         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2776
2777         /* Pad to end of cacheline */
2778         while ((unsigned long)batch % CACHELINE_BYTES)
2779                 *batch++ = MI_NOOP;
2780
2781         /*
2782          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
2783          * execution depends on the length specified in terms of cache lines
2784          * in the register CTX_RCS_INDIRECT_CTX
2785          */
2786
2787         return batch;
2788 }
2789
2790 struct lri {
2791         i915_reg_t reg;
2792         u32 value;
2793 };
2794
2795 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
2796 {
2797         GEM_BUG_ON(!count || count > 63);
2798
2799         *batch++ = MI_LOAD_REGISTER_IMM(count);
2800         do {
2801                 *batch++ = i915_mmio_reg_offset(lri->reg);
2802                 *batch++ = lri->value;
2803         } while (lri++, --count);
2804         *batch++ = MI_NOOP;
2805
2806         return batch;
2807 }
2808
2809 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2810 {
2811         static const struct lri lri[] = {
2812                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
2813                 {
2814                         COMMON_SLICE_CHICKEN2,
2815                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
2816                                        0),
2817                 },
2818
2819                 /* BSpec: 11391 */
2820                 {
2821                         FF_SLICE_CHICKEN,
2822                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
2823                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
2824                 },
2825
2826                 /* BSpec: 11299 */
2827                 {
2828                         _3D_CHICKEN3,
2829                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
2830                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
2831                 }
2832         };
2833
2834         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2835
2836         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
2837         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2838
2839         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
2840         batch = gen8_emit_pipe_control(batch,
2841                                        PIPE_CONTROL_FLUSH_L3 |
2842                                        PIPE_CONTROL_STORE_DATA_INDEX |
2843                                        PIPE_CONTROL_CS_STALL |
2844                                        PIPE_CONTROL_QW_WRITE,
2845                                        LRC_PPHWSP_SCRATCH_ADDR);
2846
2847         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
2848
2849         /* WaMediaPoolStateCmdInWABB:bxt,glk */
2850         if (HAS_POOLED_EU(engine->i915)) {
2851                 /*
2852                  * EU pool configuration is setup along with golden context
2853                  * during context initialization. This value depends on
2854                  * device type (2x6 or 3x6) and needs to be updated based
2855                  * on which subslice is disabled especially for 2x6
2856                  * devices, however it is safe to load default
2857                  * configuration of 3x6 device instead of masking off
2858                  * corresponding bits because HW ignores bits of a disabled
2859                  * subslice and drops down to appropriate config. Please
2860                  * see render_state_setup() in i915_gem_render_state.c for
2861                  * possible configurations, to avoid duplication they are
2862                  * not shown here again.
2863                  */
2864                 *batch++ = GEN9_MEDIA_POOL_STATE;
2865                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
2866                 *batch++ = 0x00777000;
2867                 *batch++ = 0;
2868                 *batch++ = 0;
2869                 *batch++ = 0;
2870         }
2871
2872         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2873
2874         /* Pad to end of cacheline */
2875         while ((unsigned long)batch % CACHELINE_BYTES)
2876                 *batch++ = MI_NOOP;
2877
2878         return batch;
2879 }
2880
2881 static u32 *
2882 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2883 {
2884         int i;
2885
2886         /*
2887          * WaPipeControlBefore3DStateSamplePattern: cnl
2888          *
2889          * Ensure the engine is idle prior to programming a
2890          * 3DSTATE_SAMPLE_PATTERN during a context restore.
2891          */
2892         batch = gen8_emit_pipe_control(batch,
2893                                        PIPE_CONTROL_CS_STALL,
2894                                        0);
2895         /*
2896          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
2897          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
2898          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
2899          * confusing. Since gen8_emit_pipe_control() already advances the
2900          * batch by 6 dwords, we advance the other 10 here, completing a
2901          * cacheline. It's not clear if the workaround requires this padding
2902          * before other commands, or if it's just the regular padding we would
2903          * already have for the workaround bb, so leave it here for now.
2904          */
2905         for (i = 0; i < 10; i++)
2906                 *batch++ = MI_NOOP;
2907
2908         /* Pad to end of cacheline */
2909         while ((unsigned long)batch % CACHELINE_BYTES)
2910                 *batch++ = MI_NOOP;
2911
2912         return batch;
2913 }
2914
2915 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
2916
2917 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
2918 {
2919         struct drm_i915_gem_object *obj;
2920         struct i915_vma *vma;
2921         int err;
2922
2923         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
2924         if (IS_ERR(obj))
2925                 return PTR_ERR(obj);
2926
2927         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2928         if (IS_ERR(vma)) {
2929                 err = PTR_ERR(vma);
2930                 goto err;
2931         }
2932
2933         err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
2934         if (err)
2935                 goto err;
2936
2937         engine->wa_ctx.vma = vma;
2938         return 0;
2939
2940 err:
2941         i915_gem_object_put(obj);
2942         return err;
2943 }
2944
2945 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
2946 {
2947         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
2948 }
2949
2950 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
2951
2952 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
2953 {
2954         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2955         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
2956                                             &wa_ctx->per_ctx };
2957         wa_bb_func_t wa_bb_fn[2];
2958         struct page *page;
2959         void *batch, *batch_ptr;
2960         unsigned int i;
2961         int ret;
2962
2963         if (engine->class != RENDER_CLASS)
2964                 return 0;
2965
2966         switch (INTEL_GEN(engine->i915)) {
2967         case 12:
2968         case 11:
2969                 return 0;
2970         case 10:
2971                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
2972                 wa_bb_fn[1] = NULL;
2973                 break;
2974         case 9:
2975                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
2976                 wa_bb_fn[1] = NULL;
2977                 break;
2978         case 8:
2979                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
2980                 wa_bb_fn[1] = NULL;
2981                 break;
2982         default:
2983                 MISSING_CASE(INTEL_GEN(engine->i915));
2984                 return 0;
2985         }
2986
2987         ret = lrc_setup_wa_ctx(engine);
2988         if (ret) {
2989                 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
2990                 return ret;
2991         }
2992
2993         page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
2994         batch = batch_ptr = kmap_atomic(page);
2995
2996         /*
2997          * Emit the two workaround batch buffers, recording the offset from the
2998          * start of the workaround batch buffer object for each and their
2999          * respective sizes.
3000          */
3001         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3002                 wa_bb[i]->offset = batch_ptr - batch;
3003                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3004                                                   CACHELINE_BYTES))) {
3005                         ret = -EINVAL;
3006                         break;
3007                 }
3008                 if (wa_bb_fn[i])
3009                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3010                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3011         }
3012
3013         BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3014
3015         kunmap_atomic(batch);
3016         if (ret)
3017                 lrc_destroy_wa_ctx(engine);
3018
3019         return ret;
3020 }
3021
3022 static void enable_execlists(struct intel_engine_cs *engine)
3023 {
3024         u32 mode;
3025
3026         assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3027
3028         intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3029
3030         if (INTEL_GEN(engine->i915) >= 11)
3031                 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3032         else
3033                 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3034         ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3035
3036         ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3037
3038         ENGINE_WRITE_FW(engine,
3039                         RING_HWS_PGA,
3040                         i915_ggtt_offset(engine->status_page.vma));
3041         ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3042
3043         engine->context_tag = 0;
3044 }
3045
3046 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3047 {
3048         bool unexpected = false;
3049
3050         if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3051                 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3052                 unexpected = true;
3053         }
3054
3055         return unexpected;
3056 }
3057
3058 static int execlists_resume(struct intel_engine_cs *engine)
3059 {
3060         intel_engine_apply_workarounds(engine);
3061         intel_engine_apply_whitelist(engine);
3062
3063         intel_mocs_init_engine(engine);
3064
3065         intel_engine_reset_breadcrumbs(engine);
3066
3067         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3068                 struct drm_printer p = drm_debug_printer(__func__);
3069
3070                 intel_engine_dump(engine, &p, NULL);
3071         }
3072
3073         enable_execlists(engine);
3074
3075         return 0;
3076 }
3077
3078 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3079 {
3080         struct intel_engine_execlists * const execlists = &engine->execlists;
3081         unsigned long flags;
3082
3083         ENGINE_TRACE(engine, "depth<-%d\n",
3084                      atomic_read(&execlists->tasklet.count));
3085
3086         /*
3087          * Prevent request submission to the hardware until we have
3088          * completed the reset in i915_gem_reset_finish(). If a request
3089          * is completed by one engine, it may then queue a request
3090          * to a second via its execlists->tasklet *just* as we are
3091          * calling engine->resume() and also writing the ELSP.
3092          * Turning off the execlists->tasklet until the reset is over
3093          * prevents the race.
3094          */
3095         __tasklet_disable_sync_once(&execlists->tasklet);
3096         GEM_BUG_ON(!reset_in_progress(execlists));
3097
3098         /* And flush any current direct submission. */
3099         spin_lock_irqsave(&engine->active.lock, flags);
3100         spin_unlock_irqrestore(&engine->active.lock, flags);
3101
3102         /*
3103          * We stop engines, otherwise we might get failed reset and a
3104          * dead gpu (on elk). Also as modern gpu as kbl can suffer
3105          * from system hang if batchbuffer is progressing when
3106          * the reset is issued, regardless of READY_TO_RESET ack.
3107          * Thus assume it is best to stop engines on all gens
3108          * where we have a gpu reset.
3109          *
3110          * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3111          *
3112          * FIXME: Wa for more modern gens needs to be validated
3113          */
3114         intel_engine_stop_cs(engine);
3115 }
3116
3117 static void reset_csb_pointers(struct intel_engine_cs *engine)
3118 {
3119         struct intel_engine_execlists * const execlists = &engine->execlists;
3120         const unsigned int reset_value = execlists->csb_size - 1;
3121
3122         ring_set_paused(engine, 0);
3123
3124         /*
3125          * After a reset, the HW starts writing into CSB entry [0]. We
3126          * therefore have to set our HEAD pointer back one entry so that
3127          * the *first* entry we check is entry 0. To complicate this further,
3128          * as we don't wait for the first interrupt after reset, we have to
3129          * fake the HW write to point back to the last entry so that our
3130          * inline comparison of our cached head position against the last HW
3131          * write works even before the first interrupt.
3132          */
3133         execlists->csb_head = reset_value;
3134         WRITE_ONCE(*execlists->csb_write, reset_value);
3135         wmb(); /* Make sure this is visible to HW (paranoia?) */
3136
3137         /*
3138          * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3139          * Bludgeon them with a mmio update to be sure.
3140          */
3141         ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3142                      reset_value << 8 | reset_value);
3143         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3144
3145         invalidate_csb_entries(&execlists->csb_status[0],
3146                                &execlists->csb_status[reset_value]);
3147 }
3148
3149 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3150 {
3151         int x;
3152
3153         x = lrc_ring_mi_mode(engine);
3154         if (x != -1) {
3155                 regs[x + 1] &= ~STOP_RING;
3156                 regs[x + 1] |= STOP_RING << 16;
3157         }
3158 }
3159
3160 static void __execlists_reset_reg_state(const struct intel_context *ce,
3161                                         const struct intel_engine_cs *engine)
3162 {
3163         u32 *regs = ce->lrc_reg_state;
3164
3165         __reset_stop_ring(regs, engine);
3166 }
3167
3168 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3169 {
3170         struct intel_engine_execlists * const execlists = &engine->execlists;
3171         struct intel_context *ce;
3172         struct i915_request *rq;
3173
3174         mb(); /* paranoia: read the CSB pointers from after the reset */
3175         clflush(execlists->csb_write);
3176         mb();
3177
3178         process_csb(engine); /* drain preemption events */
3179
3180         /* Following the reset, we need to reload the CSB read/write pointers */
3181         reset_csb_pointers(engine);
3182
3183         /*
3184          * Save the currently executing context, even if we completed
3185          * its request, it was still running at the time of the
3186          * reset and will have been clobbered.
3187          */
3188         rq = execlists_active(execlists);
3189         if (!rq)
3190                 goto unwind;
3191
3192         /* We still have requests in-flight; the engine should be active */
3193         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3194
3195         ce = rq->context;
3196         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3197
3198         if (i915_request_completed(rq)) {
3199                 /* Idle context; tidy up the ring so we can restart afresh */
3200                 ce->ring->head = intel_ring_wrap(ce->ring, rq->tail);
3201                 goto out_replay;
3202         }
3203
3204         /* Context has requests still in-flight; it should not be idle! */
3205         GEM_BUG_ON(i915_active_is_idle(&ce->active));
3206         rq = active_request(ce->timeline, rq);
3207         ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
3208         GEM_BUG_ON(ce->ring->head == ce->ring->tail);
3209
3210         /*
3211          * If this request hasn't started yet, e.g. it is waiting on a
3212          * semaphore, we need to avoid skipping the request or else we
3213          * break the signaling chain. However, if the context is corrupt
3214          * the request will not restart and we will be stuck with a wedged
3215          * device. It is quite often the case that if we issue a reset
3216          * while the GPU is loading the context image, that the context
3217          * image becomes corrupt.
3218          *
3219          * Otherwise, if we have not started yet, the request should replay
3220          * perfectly and we do not need to flag the result as being erroneous.
3221          */
3222         if (!i915_request_started(rq))
3223                 goto out_replay;
3224
3225         /*
3226          * If the request was innocent, we leave the request in the ELSP
3227          * and will try to replay it on restarting. The context image may
3228          * have been corrupted by the reset, in which case we may have
3229          * to service a new GPU hang, but more likely we can continue on
3230          * without impact.
3231          *
3232          * If the request was guilty, we presume the context is corrupt
3233          * and have to at least restore the RING register in the context
3234          * image back to the expected values to skip over the guilty request.
3235          */
3236         __i915_request_reset(rq, stalled);
3237         if (!stalled)
3238                 goto out_replay;
3239
3240         /*
3241          * We want a simple context + ring to execute the breadcrumb update.
3242          * We cannot rely on the context being intact across the GPU hang,
3243          * so clear it and rebuild just what we need for the breadcrumb.
3244          * All pending requests for this context will be zapped, and any
3245          * future request will be after userspace has had the opportunity
3246          * to recreate its own state.
3247          */
3248         GEM_BUG_ON(!intel_context_is_pinned(ce));
3249         restore_default_state(ce, engine);
3250
3251 out_replay:
3252         ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3253                      ce->ring->head, ce->ring->tail);
3254         intel_ring_update_space(ce->ring);
3255         __execlists_reset_reg_state(ce, engine);
3256         __execlists_update_reg_state(ce, engine);
3257         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3258
3259 unwind:
3260         /* Push back any incomplete requests for replay after the reset. */
3261         cancel_port_requests(execlists);
3262         __unwind_incomplete_requests(engine);
3263 }
3264
3265 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3266 {
3267         unsigned long flags;
3268
3269         ENGINE_TRACE(engine, "\n");
3270
3271         spin_lock_irqsave(&engine->active.lock, flags);
3272
3273         __execlists_reset(engine, stalled);
3274
3275         spin_unlock_irqrestore(&engine->active.lock, flags);
3276 }
3277
3278 static void nop_submission_tasklet(unsigned long data)
3279 {
3280         /* The driver is wedged; don't process any more events. */
3281 }
3282
3283 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3284 {
3285         struct intel_engine_execlists * const execlists = &engine->execlists;
3286         struct i915_request *rq, *rn;
3287         struct rb_node *rb;
3288         unsigned long flags;
3289
3290         ENGINE_TRACE(engine, "\n");
3291
3292         /*
3293          * Before we call engine->cancel_requests(), we should have exclusive
3294          * access to the submission state. This is arranged for us by the
3295          * caller disabling the interrupt generation, the tasklet and other
3296          * threads that may then access the same state, giving us a free hand
3297          * to reset state. However, we still need to let lockdep be aware that
3298          * we know this state may be accessed in hardirq context, so we
3299          * disable the irq around this manipulation and we want to keep
3300          * the spinlock focused on its duties and not accidentally conflate
3301          * coverage to the submission's irq state. (Similarly, although we
3302          * shouldn't need to disable irq around the manipulation of the
3303          * submission's irq state, we also wish to remind ourselves that
3304          * it is irq state.)
3305          */
3306         spin_lock_irqsave(&engine->active.lock, flags);
3307
3308         __execlists_reset(engine, true);
3309
3310         /* Mark all executing requests as skipped. */
3311         list_for_each_entry(rq, &engine->active.requests, sched.link)
3312                 mark_eio(rq);
3313
3314         /* Flush the queued requests to the timeline list (for retiring). */
3315         while ((rb = rb_first_cached(&execlists->queue))) {
3316                 struct i915_priolist *p = to_priolist(rb);
3317                 int i;
3318
3319                 priolist_for_each_request_consume(rq, rn, p, i) {
3320                         mark_eio(rq);
3321                         __i915_request_submit(rq);
3322                 }
3323
3324                 rb_erase_cached(&p->node, &execlists->queue);
3325                 i915_priolist_free(p);
3326         }
3327
3328         /* Cancel all attached virtual engines */
3329         while ((rb = rb_first_cached(&execlists->virtual))) {
3330                 struct virtual_engine *ve =
3331                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3332
3333                 rb_erase_cached(rb, &execlists->virtual);
3334                 RB_CLEAR_NODE(rb);
3335
3336                 spin_lock(&ve->base.active.lock);
3337                 rq = fetch_and_zero(&ve->request);
3338                 if (rq) {
3339                         mark_eio(rq);
3340
3341                         rq->engine = engine;
3342                         __i915_request_submit(rq);
3343                         i915_request_put(rq);
3344
3345                         ve->base.execlists.queue_priority_hint = INT_MIN;
3346                 }
3347                 spin_unlock(&ve->base.active.lock);
3348         }
3349
3350         /* Remaining _unready_ requests will be nop'ed when submitted */
3351
3352         execlists->queue_priority_hint = INT_MIN;
3353         execlists->queue = RB_ROOT_CACHED;
3354
3355         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3356         execlists->tasklet.func = nop_submission_tasklet;
3357
3358         spin_unlock_irqrestore(&engine->active.lock, flags);
3359 }
3360
3361 static void execlists_reset_finish(struct intel_engine_cs *engine)
3362 {
3363         struct intel_engine_execlists * const execlists = &engine->execlists;
3364
3365         /*
3366          * After a GPU reset, we may have requests to replay. Do so now while
3367          * we still have the forcewake to be sure that the GPU is not allowed
3368          * to sleep before we restart and reload a context.
3369          */
3370         GEM_BUG_ON(!reset_in_progress(execlists));
3371         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3372                 execlists->tasklet.func(execlists->tasklet.data);
3373
3374         if (__tasklet_enable(&execlists->tasklet))
3375                 /* And kick in case we missed a new request submission. */
3376                 tasklet_hi_schedule(&execlists->tasklet);
3377         ENGINE_TRACE(engine, "depth->%d\n",
3378                      atomic_read(&execlists->tasklet.count));
3379 }
3380
3381 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3382                                     u64 offset, u32 len,
3383                                     const unsigned int flags)
3384 {
3385         u32 *cs;
3386
3387         cs = intel_ring_begin(rq, 4);
3388         if (IS_ERR(cs))
3389                 return PTR_ERR(cs);
3390
3391         /*
3392          * WaDisableCtxRestoreArbitration:bdw,chv
3393          *
3394          * We don't need to perform MI_ARB_ENABLE as often as we do (in
3395          * particular all the gen that do not need the w/a at all!), if we
3396          * took care to make sure that on every switch into this context
3397          * (both ordinary and for preemption) that arbitrartion was enabled
3398          * we would be fine.  However, for gen8 there is another w/a that
3399          * requires us to not preempt inside GPGPU execution, so we keep
3400          * arbitration disabled for gen8 batches. Arbitration will be
3401          * re-enabled before we close the request
3402          * (engine->emit_fini_breadcrumb).
3403          */
3404         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3405
3406         /* FIXME(BDW+): Address space and security selectors. */
3407         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
3408                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3409         *cs++ = lower_32_bits(offset);
3410         *cs++ = upper_32_bits(offset);
3411
3412         intel_ring_advance(rq, cs);
3413
3414         return 0;
3415 }
3416
3417 static int gen8_emit_bb_start(struct i915_request *rq,
3418                               u64 offset, u32 len,
3419                               const unsigned int flags)
3420 {
3421         u32 *cs;
3422
3423         cs = intel_ring_begin(rq, 6);
3424         if (IS_ERR(cs))
3425                 return PTR_ERR(cs);
3426
3427         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3428
3429         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
3430                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3431         *cs++ = lower_32_bits(offset);
3432         *cs++ = upper_32_bits(offset);
3433
3434         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3435         *cs++ = MI_NOOP;
3436
3437         intel_ring_advance(rq, cs);
3438
3439         return 0;
3440 }
3441
3442 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3443 {
3444         ENGINE_WRITE(engine, RING_IMR,
3445                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
3446         ENGINE_POSTING_READ(engine, RING_IMR);
3447 }
3448
3449 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3450 {
3451         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3452 }
3453
3454 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3455 {
3456         u32 cmd, *cs;
3457
3458         cs = intel_ring_begin(request, 4);
3459         if (IS_ERR(cs))
3460                 return PTR_ERR(cs);
3461
3462         cmd = MI_FLUSH_DW + 1;
3463
3464         /* We always require a command barrier so that subsequent
3465          * commands, such as breadcrumb interrupts, are strictly ordered
3466          * wrt the contents of the write cache being flushed to memory
3467          * (and thus being coherent from the CPU).
3468          */
3469         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3470
3471         if (mode & EMIT_INVALIDATE) {
3472                 cmd |= MI_INVALIDATE_TLB;
3473                 if (request->engine->class == VIDEO_DECODE_CLASS)
3474                         cmd |= MI_INVALIDATE_BSD;
3475         }
3476
3477         *cs++ = cmd;
3478         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3479         *cs++ = 0; /* upper addr */
3480         *cs++ = 0; /* value */
3481         intel_ring_advance(request, cs);
3482
3483         return 0;
3484 }
3485
3486 static int gen8_emit_flush_render(struct i915_request *request,
3487                                   u32 mode)
3488 {
3489         bool vf_flush_wa = false, dc_flush_wa = false;
3490         u32 *cs, flags = 0;
3491         int len;
3492
3493         flags |= PIPE_CONTROL_CS_STALL;
3494
3495         if (mode & EMIT_FLUSH) {
3496                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3497                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3498                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3499                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
3500         }
3501
3502         if (mode & EMIT_INVALIDATE) {
3503                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
3504                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3505                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3506                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3507                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3508                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3509                 flags |= PIPE_CONTROL_QW_WRITE;
3510                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3511
3512                 /*
3513                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3514                  * pipe control.
3515                  */
3516                 if (IS_GEN(request->i915, 9))
3517                         vf_flush_wa = true;
3518
3519                 /* WaForGAMHang:kbl */
3520                 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3521                         dc_flush_wa = true;
3522         }
3523
3524         len = 6;
3525
3526         if (vf_flush_wa)
3527                 len += 6;
3528
3529         if (dc_flush_wa)
3530                 len += 12;
3531
3532         cs = intel_ring_begin(request, len);
3533         if (IS_ERR(cs))
3534                 return PTR_ERR(cs);
3535
3536         if (vf_flush_wa)
3537                 cs = gen8_emit_pipe_control(cs, 0, 0);
3538
3539         if (dc_flush_wa)
3540                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3541                                             0);
3542
3543         cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3544
3545         if (dc_flush_wa)
3546                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3547
3548         intel_ring_advance(request, cs);
3549
3550         return 0;
3551 }
3552
3553 static int gen11_emit_flush_render(struct i915_request *request,
3554                                    u32 mode)
3555 {
3556         if (mode & EMIT_FLUSH) {
3557                 u32 *cs;
3558                 u32 flags = 0;
3559
3560                 flags |= PIPE_CONTROL_CS_STALL;
3561
3562                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3563                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3564                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3565                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3566                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
3567                 flags |= PIPE_CONTROL_QW_WRITE;
3568                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3569
3570                 cs = intel_ring_begin(request, 6);
3571                 if (IS_ERR(cs))
3572                         return PTR_ERR(cs);
3573
3574                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3575                 intel_ring_advance(request, cs);
3576         }
3577
3578         if (mode & EMIT_INVALIDATE) {
3579                 u32 *cs;
3580                 u32 flags = 0;
3581
3582                 flags |= PIPE_CONTROL_CS_STALL;
3583
3584                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3585                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
3586                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3587                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3588                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3589                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3590                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3591                 flags |= PIPE_CONTROL_QW_WRITE;
3592                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3593
3594                 cs = intel_ring_begin(request, 6);
3595                 if (IS_ERR(cs))
3596                         return PTR_ERR(cs);
3597
3598                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3599                 intel_ring_advance(request, cs);
3600         }
3601
3602         return 0;
3603 }
3604
3605 static u32 preparser_disable(bool state)
3606 {
3607         return MI_ARB_CHECK | 1 << 8 | state;
3608 }
3609
3610 static int gen12_emit_flush_render(struct i915_request *request,
3611                                    u32 mode)
3612 {
3613         if (mode & EMIT_FLUSH) {
3614                 u32 flags = 0;
3615                 u32 *cs;
3616
3617                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3618                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3619                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3620                 /* Wa_1409600907:tgl */
3621                 flags |= PIPE_CONTROL_DEPTH_STALL;
3622                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3623                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
3624                 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3625
3626                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3627                 flags |= PIPE_CONTROL_QW_WRITE;
3628
3629                 flags |= PIPE_CONTROL_CS_STALL;
3630
3631                 cs = intel_ring_begin(request, 6);
3632                 if (IS_ERR(cs))
3633                         return PTR_ERR(cs);
3634
3635                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3636                 intel_ring_advance(request, cs);
3637         }
3638
3639         if (mode & EMIT_INVALIDATE) {
3640                 u32 flags = 0;
3641                 u32 *cs;
3642
3643                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3644                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
3645                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3646                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3647                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3648                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3649                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3650                 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
3651
3652                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3653                 flags |= PIPE_CONTROL_QW_WRITE;
3654
3655                 flags |= PIPE_CONTROL_CS_STALL;
3656
3657                 cs = intel_ring_begin(request, 8);
3658                 if (IS_ERR(cs))
3659                         return PTR_ERR(cs);
3660
3661                 /*
3662                  * Prevent the pre-parser from skipping past the TLB
3663                  * invalidate and loading a stale page for the batch
3664                  * buffer / request payload.
3665                  */
3666                 *cs++ = preparser_disable(true);
3667
3668                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3669
3670                 *cs++ = preparser_disable(false);
3671                 intel_ring_advance(request, cs);
3672
3673                 /*
3674                  * Wa_1604544889:tgl
3675                  */
3676                 if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
3677                         flags = 0;
3678                         flags |= PIPE_CONTROL_CS_STALL;
3679                         flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3680
3681                         flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3682                         flags |= PIPE_CONTROL_QW_WRITE;
3683
3684                         cs = intel_ring_begin(request, 6);
3685                         if (IS_ERR(cs))
3686                                 return PTR_ERR(cs);
3687
3688                         cs = gen8_emit_pipe_control(cs, flags,
3689                                                     LRC_PPHWSP_SCRATCH_ADDR);
3690                         intel_ring_advance(request, cs);
3691                 }
3692         }
3693
3694         return 0;
3695 }
3696
3697 /*
3698  * Reserve space for 2 NOOPs at the end of each request to be
3699  * used as a workaround for not being allowed to do lite
3700  * restore with HEAD==TAIL (WaIdleLiteRestore).
3701  */
3702 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
3703 {
3704         /* Ensure there's always at least one preemption point per-request. */
3705         *cs++ = MI_ARB_CHECK;
3706         *cs++ = MI_NOOP;
3707         request->wa_tail = intel_ring_offset(request, cs);
3708
3709         return cs;
3710 }
3711
3712 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
3713 {
3714         *cs++ = MI_SEMAPHORE_WAIT |
3715                 MI_SEMAPHORE_GLOBAL_GTT |
3716                 MI_SEMAPHORE_POLL |
3717                 MI_SEMAPHORE_SAD_EQ_SDD;
3718         *cs++ = 0;
3719         *cs++ = intel_hws_preempt_address(request->engine);
3720         *cs++ = 0;
3721
3722         return cs;
3723 }
3724
3725 static __always_inline u32*
3726 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
3727                                  u32 *cs)
3728 {
3729         *cs++ = MI_USER_INTERRUPT;
3730
3731         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3732         if (intel_engine_has_semaphores(request->engine))
3733                 cs = emit_preempt_busywait(request, cs);
3734
3735         request->tail = intel_ring_offset(request, cs);
3736         assert_ring_tail_valid(request->ring, request->tail);
3737
3738         return gen8_emit_wa_tail(request, cs);
3739 }
3740
3741 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3742 {
3743         cs = gen8_emit_ggtt_write(cs,
3744                                   request->fence.seqno,
3745                                   i915_request_active_timeline(request)->hwsp_offset,
3746                                   0);
3747
3748         return gen8_emit_fini_breadcrumb_footer(request, cs);
3749 }
3750
3751 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3752 {
3753         cs = gen8_emit_pipe_control(cs,
3754                                     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3755                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3756                                     PIPE_CONTROL_DC_FLUSH_ENABLE,
3757                                     0);
3758
3759         /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
3760         cs = gen8_emit_ggtt_write_rcs(cs,
3761                                       request->fence.seqno,
3762                                       i915_request_active_timeline(request)->hwsp_offset,
3763                                       PIPE_CONTROL_FLUSH_ENABLE |
3764                                       PIPE_CONTROL_CS_STALL);
3765
3766         return gen8_emit_fini_breadcrumb_footer(request, cs);
3767 }
3768
3769 static u32 *
3770 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3771 {
3772         cs = gen8_emit_ggtt_write_rcs(cs,
3773                                       request->fence.seqno,
3774                                       i915_request_active_timeline(request)->hwsp_offset,
3775                                       PIPE_CONTROL_CS_STALL |
3776                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
3777                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3778                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3779                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
3780                                       PIPE_CONTROL_FLUSH_ENABLE);
3781
3782         return gen8_emit_fini_breadcrumb_footer(request, cs);
3783 }
3784
3785 /*
3786  * Note that the CS instruction pre-parser will not stall on the breadcrumb
3787  * flush and will continue pre-fetching the instructions after it before the
3788  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
3789  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
3790  * of the next request before the memory has been flushed, we're guaranteed that
3791  * we won't access the batch itself too early.
3792  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
3793  * so, if the current request is modifying an instruction in the next request on
3794  * the same intel_context, we might pre-fetch and then execute the pre-update
3795  * instruction. To avoid this, the users of self-modifying code should either
3796  * disable the parser around the code emitting the memory writes, via a new flag
3797  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
3798  * the in-kernel use-cases we've opted to use a separate context, see
3799  * reloc_gpu() as an example.
3800  * All the above applies only to the instructions themselves. Non-inline data
3801  * used by the instructions is not pre-fetched.
3802  */
3803
3804 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
3805 {
3806         *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
3807                 MI_SEMAPHORE_GLOBAL_GTT |
3808                 MI_SEMAPHORE_POLL |
3809                 MI_SEMAPHORE_SAD_EQ_SDD;
3810         *cs++ = 0;
3811         *cs++ = intel_hws_preempt_address(request->engine);
3812         *cs++ = 0;
3813         *cs++ = 0;
3814         *cs++ = MI_NOOP;
3815
3816         return cs;
3817 }
3818
3819 static __always_inline u32*
3820 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
3821 {
3822         *cs++ = MI_USER_INTERRUPT;
3823
3824         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3825         if (intel_engine_has_semaphores(request->engine))
3826                 cs = gen12_emit_preempt_busywait(request, cs);
3827
3828         request->tail = intel_ring_offset(request, cs);
3829         assert_ring_tail_valid(request->ring, request->tail);
3830
3831         return gen8_emit_wa_tail(request, cs);
3832 }
3833
3834 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3835 {
3836         cs = gen8_emit_ggtt_write(cs,
3837                                   request->fence.seqno,
3838                                   i915_request_active_timeline(request)->hwsp_offset,
3839                                   0);
3840
3841         return gen12_emit_fini_breadcrumb_footer(request, cs);
3842 }
3843
3844 static u32 *
3845 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3846 {
3847         cs = gen8_emit_ggtt_write_rcs(cs,
3848                                       request->fence.seqno,
3849                                       i915_request_active_timeline(request)->hwsp_offset,
3850                                       PIPE_CONTROL_CS_STALL |
3851                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
3852                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3853                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3854                                       /* Wa_1409600907:tgl */
3855                                       PIPE_CONTROL_DEPTH_STALL |
3856                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
3857                                       PIPE_CONTROL_FLUSH_ENABLE |
3858                                       PIPE_CONTROL_HDC_PIPELINE_FLUSH);
3859
3860         return gen12_emit_fini_breadcrumb_footer(request, cs);
3861 }
3862
3863 static void execlists_park(struct intel_engine_cs *engine)
3864 {
3865         cancel_timer(&engine->execlists.timer);
3866         cancel_timer(&engine->execlists.preempt);
3867 }
3868
3869 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
3870 {
3871         engine->submit_request = execlists_submit_request;
3872         engine->schedule = i915_schedule;
3873         engine->execlists.tasklet.func = execlists_submission_tasklet;
3874
3875         engine->reset.prepare = execlists_reset_prepare;
3876         engine->reset.rewind = execlists_reset_rewind;
3877         engine->reset.cancel = execlists_reset_cancel;
3878         engine->reset.finish = execlists_reset_finish;
3879
3880         engine->park = execlists_park;
3881         engine->unpark = NULL;
3882
3883         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
3884         if (!intel_vgpu_active(engine->i915)) {
3885                 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
3886                 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
3887                         engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3888         }
3889
3890         if (INTEL_GEN(engine->i915) >= 12)
3891                 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
3892
3893         if (intel_engine_has_preemption(engine))
3894                 engine->emit_bb_start = gen8_emit_bb_start;
3895         else
3896                 engine->emit_bb_start = gen8_emit_bb_start_noarb;
3897 }
3898
3899 static void execlists_shutdown(struct intel_engine_cs *engine)
3900 {
3901         /* Synchronise with residual timers and any softirq they raise */
3902         del_timer_sync(&engine->execlists.timer);
3903         del_timer_sync(&engine->execlists.preempt);
3904         tasklet_kill(&engine->execlists.tasklet);
3905 }
3906
3907 static void execlists_release(struct intel_engine_cs *engine)
3908 {
3909         execlists_shutdown(engine);
3910
3911         intel_engine_cleanup_common(engine);
3912         lrc_destroy_wa_ctx(engine);
3913 }
3914
3915 static void
3916 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3917 {
3918         /* Default vfuncs which can be overriden by each engine. */
3919
3920         engine->resume = execlists_resume;
3921
3922         engine->cops = &execlists_context_ops;
3923         engine->request_alloc = execlists_request_alloc;
3924
3925         engine->emit_flush = gen8_emit_flush;
3926         engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3927         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
3928         if (INTEL_GEN(engine->i915) >= 12)
3929                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
3930
3931         engine->set_default_submission = intel_execlists_set_default_submission;
3932
3933         if (INTEL_GEN(engine->i915) < 11) {
3934                 engine->irq_enable = gen8_logical_ring_enable_irq;
3935                 engine->irq_disable = gen8_logical_ring_disable_irq;
3936         } else {
3937                 /*
3938                  * TODO: On Gen11 interrupt masks need to be clear
3939                  * to allow C6 entry. Keep interrupts enabled at
3940                  * and take the hit of generating extra interrupts
3941                  * until a more refined solution exists.
3942                  */
3943         }
3944 }
3945
3946 static inline void
3947 logical_ring_default_irqs(struct intel_engine_cs *engine)
3948 {
3949         unsigned int shift = 0;
3950
3951         if (INTEL_GEN(engine->i915) < 11) {
3952                 const u8 irq_shifts[] = {
3953                         [RCS0]  = GEN8_RCS_IRQ_SHIFT,
3954                         [BCS0]  = GEN8_BCS_IRQ_SHIFT,
3955                         [VCS0]  = GEN8_VCS0_IRQ_SHIFT,
3956                         [VCS1]  = GEN8_VCS1_IRQ_SHIFT,
3957                         [VECS0] = GEN8_VECS_IRQ_SHIFT,
3958                 };
3959
3960                 shift = irq_shifts[engine->id];
3961         }
3962
3963         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3964         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3965 }
3966
3967 static void rcs_submission_override(struct intel_engine_cs *engine)
3968 {
3969         switch (INTEL_GEN(engine->i915)) {
3970         case 12:
3971                 engine->emit_flush = gen12_emit_flush_render;
3972                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
3973                 break;
3974         case 11:
3975                 engine->emit_flush = gen11_emit_flush_render;
3976                 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3977                 break;
3978         default:
3979                 engine->emit_flush = gen8_emit_flush_render;
3980                 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3981                 break;
3982         }
3983 }
3984
3985 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3986 {
3987         struct intel_engine_execlists * const execlists = &engine->execlists;
3988         struct drm_i915_private *i915 = engine->i915;
3989         struct intel_uncore *uncore = engine->uncore;
3990         u32 base = engine->mmio_base;
3991
3992         tasklet_init(&engine->execlists.tasklet,
3993                      execlists_submission_tasklet, (unsigned long)engine);
3994         timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
3995         timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
3996
3997         logical_ring_default_vfuncs(engine);
3998         logical_ring_default_irqs(engine);
3999
4000         if (engine->class == RENDER_CLASS)
4001                 rcs_submission_override(engine);
4002
4003         if (intel_init_workaround_bb(engine))
4004                 /*
4005                  * We continue even if we fail to initialize WA batch
4006                  * because we only expect rare glitches but nothing
4007                  * critical to prevent us from using GPU
4008                  */
4009                 DRM_ERROR("WA batch buffer initialization failed\n");
4010
4011         if (HAS_LOGICAL_RING_ELSQ(i915)) {
4012                 execlists->submit_reg = uncore->regs +
4013                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4014                 execlists->ctrl_reg = uncore->regs +
4015                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4016         } else {
4017                 execlists->submit_reg = uncore->regs +
4018                         i915_mmio_reg_offset(RING_ELSP(base));
4019         }
4020
4021         execlists->csb_status =
4022                 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4023
4024         execlists->csb_write =
4025                 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
4026
4027         if (INTEL_GEN(i915) < 11)
4028                 execlists->csb_size = GEN8_CSB_ENTRIES;
4029         else
4030                 execlists->csb_size = GEN11_CSB_ENTRIES;
4031
4032         reset_csb_pointers(engine);
4033
4034         /* Finally, take ownership and responsibility for cleanup! */
4035         engine->release = execlists_release;
4036
4037         return 0;
4038 }
4039
4040 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4041 {
4042         u32 indirect_ctx_offset;
4043
4044         switch (INTEL_GEN(engine->i915)) {
4045         default:
4046                 MISSING_CASE(INTEL_GEN(engine->i915));
4047                 /* fall through */
4048         case 12:
4049                 indirect_ctx_offset =
4050                         GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4051                 break;
4052         case 11:
4053                 indirect_ctx_offset =
4054                         GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4055                 break;
4056         case 10:
4057                 indirect_ctx_offset =
4058                         GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4059                 break;
4060         case 9:
4061                 indirect_ctx_offset =
4062                         GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4063                 break;
4064         case 8:
4065                 indirect_ctx_offset =
4066                         GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4067                 break;
4068         }
4069
4070         return indirect_ctx_offset;
4071 }
4072
4073
4074 static void init_common_reg_state(u32 * const regs,
4075                                   const struct intel_engine_cs *engine,
4076                                   const struct intel_ring *ring,
4077                                   bool inhibit)
4078 {
4079         u32 ctl;
4080
4081         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4082         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4083         if (inhibit)
4084                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4085         if (INTEL_GEN(engine->i915) < 11)
4086                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4087                                            CTX_CTRL_RS_CTX_ENABLE);
4088         regs[CTX_CONTEXT_CONTROL] = ctl;
4089
4090         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4091 }
4092
4093 static void init_wa_bb_reg_state(u32 * const regs,
4094                                  const struct intel_engine_cs *engine,
4095                                  u32 pos_bb_per_ctx)
4096 {
4097         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4098
4099         if (wa_ctx->per_ctx.size) {
4100                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4101
4102                 regs[pos_bb_per_ctx] =
4103                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4104         }
4105
4106         if (wa_ctx->indirect_ctx.size) {
4107                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4108
4109                 regs[pos_bb_per_ctx + 2] =
4110                         (ggtt_offset + wa_ctx->indirect_ctx.offset) |
4111                         (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4112
4113                 regs[pos_bb_per_ctx + 4] =
4114                         intel_lr_indirect_ctx_offset(engine) << 6;
4115         }
4116 }
4117
4118 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4119 {
4120         if (i915_vm_is_4lvl(&ppgtt->vm)) {
4121                 /* 64b PPGTT (48bit canonical)
4122                  * PDP0_DESCRIPTOR contains the base address to PML4 and
4123                  * other PDP Descriptors are ignored.
4124                  */
4125                 ASSIGN_CTX_PML4(ppgtt, regs);
4126         } else {
4127                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
4128                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
4129                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
4130                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
4131         }
4132 }
4133
4134 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4135 {
4136         if (i915_is_ggtt(vm))
4137                 return i915_vm_to_ggtt(vm)->alias;
4138         else
4139                 return i915_vm_to_ppgtt(vm);
4140 }
4141
4142 static void execlists_init_reg_state(u32 *regs,
4143                                      const struct intel_context *ce,
4144                                      const struct intel_engine_cs *engine,
4145                                      const struct intel_ring *ring,
4146                                      bool inhibit)
4147 {
4148         /*
4149          * A context is actually a big batch buffer with several
4150          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4151          * values we are setting here are only for the first context restore:
4152          * on a subsequent save, the GPU will recreate this batchbuffer with new
4153          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4154          * we are not initializing here).
4155          *
4156          * Must keep consistent with virtual_update_register_offsets().
4157          */
4158         set_offsets(regs, reg_offsets(engine), engine, inhibit);
4159
4160         init_common_reg_state(regs, engine, ring, inhibit);
4161         init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4162
4163         init_wa_bb_reg_state(regs, engine,
4164                              INTEL_GEN(engine->i915) >= 12 ?
4165                              GEN12_CTX_BB_PER_CTX_PTR :
4166                              CTX_BB_PER_CTX_PTR);
4167
4168         __reset_stop_ring(regs, engine);
4169 }
4170
4171 static int
4172 populate_lr_context(struct intel_context *ce,
4173                     struct drm_i915_gem_object *ctx_obj,
4174                     struct intel_engine_cs *engine,
4175                     struct intel_ring *ring)
4176 {
4177         bool inhibit = true;
4178         void *vaddr;
4179         int ret;
4180
4181         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4182         if (IS_ERR(vaddr)) {
4183                 ret = PTR_ERR(vaddr);
4184                 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4185                 return ret;
4186         }
4187
4188         set_redzone(vaddr, engine);
4189
4190         if (engine->default_state) {
4191                 void *defaults;
4192
4193                 defaults = i915_gem_object_pin_map(engine->default_state,
4194                                                    I915_MAP_WB);
4195                 if (IS_ERR(defaults)) {
4196                         ret = PTR_ERR(defaults);
4197                         goto err_unpin_ctx;
4198                 }
4199
4200                 memcpy(vaddr, defaults, engine->context_size);
4201                 i915_gem_object_unpin_map(engine->default_state);
4202                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
4203                 inhibit = false;
4204         }
4205
4206         /* The second page of the context object contains some fields which must
4207          * be set up prior to the first execution. */
4208         execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4209                                  ce, engine, ring, inhibit);
4210
4211         ret = 0;
4212 err_unpin_ctx:
4213         __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4214         i915_gem_object_unpin_map(ctx_obj);
4215         return ret;
4216 }
4217
4218 static int __execlists_context_alloc(struct intel_context *ce,
4219                                      struct intel_engine_cs *engine)
4220 {
4221         struct drm_i915_gem_object *ctx_obj;
4222         struct intel_ring *ring;
4223         struct i915_vma *vma;
4224         u32 context_size;
4225         int ret;
4226
4227         GEM_BUG_ON(ce->state);
4228         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4229
4230         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4231                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4232
4233         ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4234         if (IS_ERR(ctx_obj))
4235                 return PTR_ERR(ctx_obj);
4236
4237         vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4238         if (IS_ERR(vma)) {
4239                 ret = PTR_ERR(vma);
4240                 goto error_deref_obj;
4241         }
4242
4243         if (!ce->timeline) {
4244                 struct intel_timeline *tl;
4245
4246                 tl = intel_timeline_create(engine->gt, NULL);
4247                 if (IS_ERR(tl)) {
4248                         ret = PTR_ERR(tl);
4249                         goto error_deref_obj;
4250                 }
4251
4252                 ce->timeline = tl;
4253         }
4254
4255         ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4256         if (IS_ERR(ring)) {
4257                 ret = PTR_ERR(ring);
4258                 goto error_deref_obj;
4259         }
4260
4261         ret = populate_lr_context(ce, ctx_obj, engine, ring);
4262         if (ret) {
4263                 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4264                 goto error_ring_free;
4265         }
4266
4267         ce->ring = ring;
4268         ce->state = vma;
4269
4270         return 0;
4271
4272 error_ring_free:
4273         intel_ring_put(ring);
4274 error_deref_obj:
4275         i915_gem_object_put(ctx_obj);
4276         return ret;
4277 }
4278
4279 static struct list_head *virtual_queue(struct virtual_engine *ve)
4280 {
4281         return &ve->base.execlists.default_priolist.requests[0];
4282 }
4283
4284 static void virtual_context_destroy(struct kref *kref)
4285 {
4286         struct virtual_engine *ve =
4287                 container_of(kref, typeof(*ve), context.ref);
4288         unsigned int n;
4289
4290         GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4291         GEM_BUG_ON(ve->request);
4292         GEM_BUG_ON(ve->context.inflight);
4293
4294         for (n = 0; n < ve->num_siblings; n++) {
4295                 struct intel_engine_cs *sibling = ve->siblings[n];
4296                 struct rb_node *node = &ve->nodes[sibling->id].rb;
4297                 unsigned long flags;
4298
4299                 if (RB_EMPTY_NODE(node))
4300                         continue;
4301
4302                 spin_lock_irqsave(&sibling->active.lock, flags);
4303
4304                 /* Detachment is lazily performed in the execlists tasklet */
4305                 if (!RB_EMPTY_NODE(node))
4306                         rb_erase_cached(node, &sibling->execlists.virtual);
4307
4308                 spin_unlock_irqrestore(&sibling->active.lock, flags);
4309         }
4310         GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4311
4312         if (ve->context.state)
4313                 __execlists_context_fini(&ve->context);
4314         intel_context_fini(&ve->context);
4315
4316         kfree(ve->bonds);
4317         kfree(ve);
4318 }
4319
4320 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4321 {
4322         int swp;
4323
4324         /*
4325          * Pick a random sibling on starting to help spread the load around.
4326          *
4327          * New contexts are typically created with exactly the same order
4328          * of siblings, and often started in batches. Due to the way we iterate
4329          * the array of sibling when submitting requests, sibling[0] is
4330          * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4331          * randomised across the system, we also help spread the load by the
4332          * first engine we inspect being different each time.
4333          *
4334          * NB This does not force us to execute on this engine, it will just
4335          * typically be the first we inspect for submission.
4336          */
4337         swp = prandom_u32_max(ve->num_siblings);
4338         if (!swp)
4339                 return;
4340
4341         swap(ve->siblings[swp], ve->siblings[0]);
4342         if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4343                 virtual_update_register_offsets(ve->context.lrc_reg_state,
4344                                                 ve->siblings[0]);
4345 }
4346
4347 static int virtual_context_alloc(struct intel_context *ce)
4348 {
4349         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4350
4351         return __execlists_context_alloc(ce, ve->siblings[0]);
4352 }
4353
4354 static int virtual_context_pin(struct intel_context *ce)
4355 {
4356         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4357         int err;
4358
4359         /* Note: we must use a real engine class for setting up reg state */
4360         err = __execlists_context_pin(ce, ve->siblings[0]);
4361         if (err)
4362                 return err;
4363
4364         virtual_engine_initial_hint(ve);
4365         return 0;
4366 }
4367
4368 static void virtual_context_enter(struct intel_context *ce)
4369 {
4370         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4371         unsigned int n;
4372
4373         for (n = 0; n < ve->num_siblings; n++)
4374                 intel_engine_pm_get(ve->siblings[n]);
4375
4376         intel_timeline_enter(ce->timeline);
4377 }
4378
4379 static void virtual_context_exit(struct intel_context *ce)
4380 {
4381         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4382         unsigned int n;
4383
4384         intel_timeline_exit(ce->timeline);
4385
4386         for (n = 0; n < ve->num_siblings; n++)
4387                 intel_engine_pm_put(ve->siblings[n]);
4388 }
4389
4390 static const struct intel_context_ops virtual_context_ops = {
4391         .alloc = virtual_context_alloc,
4392
4393         .pin = virtual_context_pin,
4394         .unpin = execlists_context_unpin,
4395
4396         .enter = virtual_context_enter,
4397         .exit = virtual_context_exit,
4398
4399         .destroy = virtual_context_destroy,
4400 };
4401
4402 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4403 {
4404         struct i915_request *rq;
4405         intel_engine_mask_t mask;
4406
4407         rq = READ_ONCE(ve->request);
4408         if (!rq)
4409                 return 0;
4410
4411         /* The rq is ready for submission; rq->execution_mask is now stable. */
4412         mask = rq->execution_mask;
4413         if (unlikely(!mask)) {
4414                 /* Invalid selection, submit to a random engine in error */
4415                 i915_request_skip(rq, -ENODEV);
4416                 mask = ve->siblings[0]->mask;
4417         }
4418
4419         ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4420                      rq->fence.context, rq->fence.seqno,
4421                      mask, ve->base.execlists.queue_priority_hint);
4422
4423         return mask;
4424 }
4425
4426 static void virtual_submission_tasklet(unsigned long data)
4427 {
4428         struct virtual_engine * const ve = (struct virtual_engine *)data;
4429         const int prio = ve->base.execlists.queue_priority_hint;
4430         intel_engine_mask_t mask;
4431         unsigned int n;
4432
4433         rcu_read_lock();
4434         mask = virtual_submission_mask(ve);
4435         rcu_read_unlock();
4436         if (unlikely(!mask))
4437                 return;
4438
4439         local_irq_disable();
4440         for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4441                 struct intel_engine_cs *sibling = ve->siblings[n];
4442                 struct ve_node * const node = &ve->nodes[sibling->id];
4443                 struct rb_node **parent, *rb;
4444                 bool first;
4445
4446                 if (unlikely(!(mask & sibling->mask))) {
4447                         if (!RB_EMPTY_NODE(&node->rb)) {
4448                                 spin_lock(&sibling->active.lock);
4449                                 rb_erase_cached(&node->rb,
4450                                                 &sibling->execlists.virtual);
4451                                 RB_CLEAR_NODE(&node->rb);
4452                                 spin_unlock(&sibling->active.lock);
4453                         }
4454                         continue;
4455                 }
4456
4457                 spin_lock(&sibling->active.lock);
4458
4459                 if (!RB_EMPTY_NODE(&node->rb)) {
4460                         /*
4461                          * Cheat and avoid rebalancing the tree if we can
4462                          * reuse this node in situ.
4463                          */
4464                         first = rb_first_cached(&sibling->execlists.virtual) ==
4465                                 &node->rb;
4466                         if (prio == node->prio || (prio > node->prio && first))
4467                                 goto submit_engine;
4468
4469                         rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4470                 }
4471
4472                 rb = NULL;
4473                 first = true;
4474                 parent = &sibling->execlists.virtual.rb_root.rb_node;
4475                 while (*parent) {
4476                         struct ve_node *other;
4477
4478                         rb = *parent;
4479                         other = rb_entry(rb, typeof(*other), rb);
4480                         if (prio > other->prio) {
4481                                 parent = &rb->rb_left;
4482                         } else {
4483                                 parent = &rb->rb_right;
4484                                 first = false;
4485                         }
4486                 }
4487
4488                 rb_link_node(&node->rb, rb, parent);
4489                 rb_insert_color_cached(&node->rb,
4490                                        &sibling->execlists.virtual,
4491                                        first);
4492
4493 submit_engine:
4494                 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4495                 node->prio = prio;
4496                 if (first && prio > sibling->execlists.queue_priority_hint) {
4497                         sibling->execlists.queue_priority_hint = prio;
4498                         tasklet_hi_schedule(&sibling->execlists.tasklet);
4499                 }
4500
4501                 spin_unlock(&sibling->active.lock);
4502         }
4503         local_irq_enable();
4504 }
4505
4506 static void virtual_submit_request(struct i915_request *rq)
4507 {
4508         struct virtual_engine *ve = to_virtual_engine(rq->engine);
4509         struct i915_request *old;
4510         unsigned long flags;
4511
4512         ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
4513                      rq->fence.context,
4514                      rq->fence.seqno);
4515
4516         GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4517
4518         spin_lock_irqsave(&ve->base.active.lock, flags);
4519
4520         old = ve->request;
4521         if (old) { /* background completion event from preempt-to-busy */
4522                 GEM_BUG_ON(!i915_request_completed(old));
4523                 __i915_request_submit(old);
4524                 i915_request_put(old);
4525         }
4526
4527         if (i915_request_completed(rq)) {
4528                 __i915_request_submit(rq);
4529
4530                 ve->base.execlists.queue_priority_hint = INT_MIN;
4531                 ve->request = NULL;
4532         } else {
4533                 ve->base.execlists.queue_priority_hint = rq_prio(rq);
4534                 ve->request = i915_request_get(rq);
4535
4536                 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4537                 list_move_tail(&rq->sched.link, virtual_queue(ve));
4538
4539                 tasklet_schedule(&ve->base.execlists.tasklet);
4540         }
4541
4542         spin_unlock_irqrestore(&ve->base.active.lock, flags);
4543 }
4544
4545 static struct ve_bond *
4546 virtual_find_bond(struct virtual_engine *ve,
4547                   const struct intel_engine_cs *master)
4548 {
4549         int i;
4550
4551         for (i = 0; i < ve->num_bonds; i++) {
4552                 if (ve->bonds[i].master == master)
4553                         return &ve->bonds[i];
4554         }
4555
4556         return NULL;
4557 }
4558
4559 static void
4560 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
4561 {
4562         struct virtual_engine *ve = to_virtual_engine(rq->engine);
4563         intel_engine_mask_t allowed, exec;
4564         struct ve_bond *bond;
4565
4566         allowed = ~to_request(signal)->engine->mask;
4567
4568         bond = virtual_find_bond(ve, to_request(signal)->engine);
4569         if (bond)
4570                 allowed &= bond->sibling_mask;
4571
4572         /* Restrict the bonded request to run on only the available engines */
4573         exec = READ_ONCE(rq->execution_mask);
4574         while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
4575                 ;
4576
4577         /* Prevent the master from being re-run on the bonded engines */
4578         to_request(signal)->execution_mask &= ~allowed;
4579 }
4580
4581 struct intel_context *
4582 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
4583                                unsigned int count)
4584 {
4585         struct virtual_engine *ve;
4586         unsigned int n;
4587         int err;
4588
4589         if (count == 0)
4590                 return ERR_PTR(-EINVAL);
4591
4592         if (count == 1)
4593                 return intel_context_create(siblings[0]);
4594
4595         ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
4596         if (!ve)
4597                 return ERR_PTR(-ENOMEM);
4598
4599         ve->base.i915 = siblings[0]->i915;
4600         ve->base.gt = siblings[0]->gt;
4601         ve->base.uncore = siblings[0]->uncore;
4602         ve->base.id = -1;
4603
4604         ve->base.class = OTHER_CLASS;
4605         ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
4606         ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4607         ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4608
4609         /*
4610          * The decision on whether to submit a request using semaphores
4611          * depends on the saturated state of the engine. We only compute
4612          * this during HW submission of the request, and we need for this
4613          * state to be globally applied to all requests being submitted
4614          * to this engine. Virtual engines encompass more than one physical
4615          * engine and so we cannot accurately tell in advance if one of those
4616          * engines is already saturated and so cannot afford to use a semaphore
4617          * and be pessimized in priority for doing so -- if we are the only
4618          * context using semaphores after all other clients have stopped, we
4619          * will be starved on the saturated system. Such a global switch for
4620          * semaphores is less than ideal, but alas is the current compromise.
4621          */
4622         ve->base.saturated = ALL_ENGINES;
4623
4624         snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
4625
4626         intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
4627         intel_engine_init_breadcrumbs(&ve->base);
4628         intel_engine_init_execlists(&ve->base);
4629
4630         ve->base.cops = &virtual_context_ops;
4631         ve->base.request_alloc = execlists_request_alloc;
4632
4633         ve->base.schedule = i915_schedule;
4634         ve->base.submit_request = virtual_submit_request;
4635         ve->base.bond_execute = virtual_bond_execute;
4636
4637         INIT_LIST_HEAD(virtual_queue(ve));
4638         ve->base.execlists.queue_priority_hint = INT_MIN;
4639         tasklet_init(&ve->base.execlists.tasklet,
4640                      virtual_submission_tasklet,
4641                      (unsigned long)ve);
4642
4643         intel_context_init(&ve->context, &ve->base);
4644
4645         for (n = 0; n < count; n++) {
4646                 struct intel_engine_cs *sibling = siblings[n];
4647
4648                 GEM_BUG_ON(!is_power_of_2(sibling->mask));
4649                 if (sibling->mask & ve->base.mask) {
4650                         DRM_DEBUG("duplicate %s entry in load balancer\n",
4651                                   sibling->name);
4652                         err = -EINVAL;
4653                         goto err_put;
4654                 }
4655
4656                 /*
4657                  * The virtual engine implementation is tightly coupled to
4658                  * the execlists backend -- we push out request directly
4659                  * into a tree inside each physical engine. We could support
4660                  * layering if we handle cloning of the requests and
4661                  * submitting a copy into each backend.
4662                  */
4663                 if (sibling->execlists.tasklet.func !=
4664                     execlists_submission_tasklet) {
4665                         err = -ENODEV;
4666                         goto err_put;
4667                 }
4668
4669                 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
4670                 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
4671
4672                 ve->siblings[ve->num_siblings++] = sibling;
4673                 ve->base.mask |= sibling->mask;
4674
4675                 /*
4676                  * All physical engines must be compatible for their emission
4677                  * functions (as we build the instructions during request
4678                  * construction and do not alter them before submission
4679                  * on the physical engine). We use the engine class as a guide
4680                  * here, although that could be refined.
4681                  */
4682                 if (ve->base.class != OTHER_CLASS) {
4683                         if (ve->base.class != sibling->class) {
4684                                 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
4685                                           sibling->class, ve->base.class);
4686                                 err = -EINVAL;
4687                                 goto err_put;
4688                         }
4689                         continue;
4690                 }
4691
4692                 ve->base.class = sibling->class;
4693                 ve->base.uabi_class = sibling->uabi_class;
4694                 snprintf(ve->base.name, sizeof(ve->base.name),
4695                          "v%dx%d", ve->base.class, count);
4696                 ve->base.context_size = sibling->context_size;
4697
4698                 ve->base.emit_bb_start = sibling->emit_bb_start;
4699                 ve->base.emit_flush = sibling->emit_flush;
4700                 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
4701                 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
4702                 ve->base.emit_fini_breadcrumb_dw =
4703                         sibling->emit_fini_breadcrumb_dw;
4704
4705                 ve->base.flags = sibling->flags;
4706         }
4707
4708         ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
4709
4710         return &ve->context;
4711
4712 err_put:
4713         intel_context_put(&ve->context);
4714         return ERR_PTR(err);
4715 }
4716
4717 struct intel_context *
4718 intel_execlists_clone_virtual(struct intel_engine_cs *src)
4719 {
4720         struct virtual_engine *se = to_virtual_engine(src);
4721         struct intel_context *dst;
4722
4723         dst = intel_execlists_create_virtual(se->siblings,
4724                                              se->num_siblings);
4725         if (IS_ERR(dst))
4726                 return dst;
4727
4728         if (se->num_bonds) {
4729                 struct virtual_engine *de = to_virtual_engine(dst->engine);
4730
4731                 de->bonds = kmemdup(se->bonds,
4732                                     sizeof(*se->bonds) * se->num_bonds,
4733                                     GFP_KERNEL);
4734                 if (!de->bonds) {
4735                         intel_context_put(dst);
4736                         return ERR_PTR(-ENOMEM);
4737                 }
4738
4739                 de->num_bonds = se->num_bonds;
4740         }
4741
4742         return dst;
4743 }
4744
4745 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
4746                                      const struct intel_engine_cs *master,
4747                                      const struct intel_engine_cs *sibling)
4748 {
4749         struct virtual_engine *ve = to_virtual_engine(engine);
4750         struct ve_bond *bond;
4751         int n;
4752
4753         /* Sanity check the sibling is part of the virtual engine */
4754         for (n = 0; n < ve->num_siblings; n++)
4755                 if (sibling == ve->siblings[n])
4756                         break;
4757         if (n == ve->num_siblings)
4758                 return -EINVAL;
4759
4760         bond = virtual_find_bond(ve, master);
4761         if (bond) {
4762                 bond->sibling_mask |= sibling->mask;
4763                 return 0;
4764         }
4765
4766         bond = krealloc(ve->bonds,
4767                         sizeof(*bond) * (ve->num_bonds + 1),
4768                         GFP_KERNEL);
4769         if (!bond)
4770                 return -ENOMEM;
4771
4772         bond[ve->num_bonds].master = master;
4773         bond[ve->num_bonds].sibling_mask = sibling->mask;
4774
4775         ve->bonds = bond;
4776         ve->num_bonds++;
4777
4778         return 0;
4779 }
4780
4781 struct intel_engine_cs *
4782 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
4783                                  unsigned int sibling)
4784 {
4785         struct virtual_engine *ve = to_virtual_engine(engine);
4786
4787         if (sibling >= ve->num_siblings)
4788                 return NULL;
4789
4790         return ve->siblings[sibling];
4791 }
4792
4793 void intel_execlists_show_requests(struct intel_engine_cs *engine,
4794                                    struct drm_printer *m,
4795                                    void (*show_request)(struct drm_printer *m,
4796                                                         struct i915_request *rq,
4797                                                         const char *prefix),
4798                                    unsigned int max)
4799 {
4800         const struct intel_engine_execlists *execlists = &engine->execlists;
4801         struct i915_request *rq, *last;
4802         unsigned long flags;
4803         unsigned int count;
4804         struct rb_node *rb;
4805
4806         spin_lock_irqsave(&engine->active.lock, flags);
4807
4808         last = NULL;
4809         count = 0;
4810         list_for_each_entry(rq, &engine->active.requests, sched.link) {
4811                 if (count++ < max - 1)
4812                         show_request(m, rq, "\t\tE ");
4813                 else
4814                         last = rq;
4815         }
4816         if (last) {
4817                 if (count > max) {
4818                         drm_printf(m,
4819                                    "\t\t...skipping %d executing requests...\n",
4820                                    count - max);
4821                 }
4822                 show_request(m, last, "\t\tE ");
4823         }
4824
4825         last = NULL;
4826         count = 0;
4827         if (execlists->queue_priority_hint != INT_MIN)
4828                 drm_printf(m, "\t\tQueue priority hint: %d\n",
4829                            execlists->queue_priority_hint);
4830         for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
4831                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
4832                 int i;
4833
4834                 priolist_for_each_request(rq, p, i) {
4835                         if (count++ < max - 1)
4836                                 show_request(m, rq, "\t\tQ ");
4837                         else
4838                                 last = rq;
4839                 }
4840         }
4841         if (last) {
4842                 if (count > max) {
4843                         drm_printf(m,
4844                                    "\t\t...skipping %d queued requests...\n",
4845                                    count - max);
4846                 }
4847                 show_request(m, last, "\t\tQ ");
4848         }
4849
4850         last = NULL;
4851         count = 0;
4852         for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
4853                 struct virtual_engine *ve =
4854                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4855                 struct i915_request *rq = READ_ONCE(ve->request);
4856
4857                 if (rq) {
4858                         if (count++ < max - 1)
4859                                 show_request(m, rq, "\t\tV ");
4860                         else
4861                                 last = rq;
4862                 }
4863         }
4864         if (last) {
4865                 if (count > max) {
4866                         drm_printf(m,
4867                                    "\t\t...skipping %d virtual requests...\n",
4868                                    count - max);
4869                 }
4870                 show_request(m, last, "\t\tV ");
4871         }
4872
4873         spin_unlock_irqrestore(&engine->active.lock, flags);
4874 }
4875
4876 void intel_lr_context_reset(struct intel_engine_cs *engine,
4877                             struct intel_context *ce,
4878                             u32 head,
4879                             bool scrub)
4880 {
4881         GEM_BUG_ON(!intel_context_is_pinned(ce));
4882
4883         /*
4884          * We want a simple context + ring to execute the breadcrumb update.
4885          * We cannot rely on the context being intact across the GPU hang,
4886          * so clear it and rebuild just what we need for the breadcrumb.
4887          * All pending requests for this context will be zapped, and any
4888          * future request will be after userspace has had the opportunity
4889          * to recreate its own state.
4890          */
4891         if (scrub)
4892                 restore_default_state(ce, engine);
4893
4894         /* Rerun the request; its payload has been neutered (if guilty). */
4895         ce->ring->head = head;
4896         intel_ring_update_space(ce->ring);
4897
4898         __execlists_update_reg_state(ce, engine);
4899 }
4900
4901 bool
4902 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
4903 {
4904         return engine->set_default_submission ==
4905                intel_execlists_set_default_submission;
4906 }
4907
4908 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4909 #include "selftest_lrc.c"
4910 #endif