drivers/gpu/drm/i915/i915_gem.c

   1 /*
   2  * Copyright © 2008-2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 #include <drm/drmP.h>
  29 #include <drm/drm_vma_manager.h>
  30 #include <drm/i915_drm.h>
  31 #include "i915_drv.h"
  32 #include "i915_gem_clflush.h"
  33 #include "i915_vgpu.h"
  34 #include "i915_trace.h"
  35 #include "intel_drv.h"
  36 #include "intel_frontbuffer.h"
  37 #include "intel_mocs.h"
  38 #include "intel_workarounds.h"
  39 #include "i915_gemfs.h"
  40 #include <linux/dma-fence-array.h>
  41 #include <linux/kthread.h>
  42 #include <linux/reservation.h>
  43 #include <linux/shmem_fs.h>
  44 #include <linux/slab.h>
  45 #include <linux/stop_machine.h>
  46 #include <linux/swap.h>
  47 #include <linux/pci.h>
  48 #include <linux/dma-buf.h>
  49
  50 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
  51
  52 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
  53 {
  54         if (obj->cache_dirty)
  55                 return false;
  56
  57         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
  58                 return true;
  59
  60         return obj->pin_global; /* currently in use by HW, keep flushed */
  61 }
  62
  63 static int
  64 insert_mappable_node(struct i915_ggtt *ggtt,
  65                      struct drm_mm_node *node, u32 size)
  66 {
  67         memset(node, 0, sizeof(*node));
  68         return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
  69                                            size, 0, I915_COLOR_UNEVICTABLE,
  70                                            0, ggtt->mappable_end,
  71                                            DRM_MM_INSERT_LOW);
  72 }
  73
  74 static void
  75 remove_mappable_node(struct drm_mm_node *node)
  76 {
  77         drm_mm_remove_node(node);
  78 }
  79
  80 /* some bookkeeping */
  81 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
  82                                   u64 size)
  83 {
  84         spin_lock(&dev_priv->mm.object_stat_lock);
  85         dev_priv->mm.object_count++;
  86         dev_priv->mm.object_memory += size;
  87         spin_unlock(&dev_priv->mm.object_stat_lock);
  88 }
  89
  90 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
  91                                      u64 size)
  92 {
  93         spin_lock(&dev_priv->mm.object_stat_lock);
  94         dev_priv->mm.object_count--;
  95         dev_priv->mm.object_memory -= size;
  96         spin_unlock(&dev_priv->mm.object_stat_lock);
  97 }
  98
  99 static int
 100 i915_gem_wait_for_error(struct i915_gpu_error *error)
 101 {
 102         int ret;
 103
 104         might_sleep();
 105
 106         /*
 107          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 108          * userspace. If it takes that long something really bad is going on and
 109          * we should simply try to bail out and fail as gracefully as possible.
 110          */
 111         ret = wait_event_interruptible_timeout(error->reset_queue,
 112                                                !i915_reset_backoff(error),
 113                                                I915_RESET_TIMEOUT);
 114         if (ret == 0) {
 115                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 116                 return -EIO;
 117         } else if (ret < 0) {
 118                 return ret;
 119         } else {
 120                 return 0;
 121         }
 122 }
 123
 124 int i915_mutex_lock_interruptible(struct drm_device *dev)
 125 {
 126         struct drm_i915_private *dev_priv = to_i915(dev);
 127         int ret;
 128
 129         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
 130         if (ret)
 131                 return ret;
 132
 133         ret = mutex_lock_interruptible(&dev->struct_mutex);
 134         if (ret)
 135                 return ret;
 136
 137         return 0;
 138 }
 139
 140 static u32 __i915_gem_park(struct drm_i915_private *i915)
 141 {
 142         GEM_TRACE("\n");
 143
 144         lockdep_assert_held(&i915->drm.struct_mutex);
 145         GEM_BUG_ON(i915->gt.active_requests);
 146         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
 147
 148         if (!i915->gt.awake)
 149                 return I915_EPOCH_INVALID;
 150
 151         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
 152
 153         /*
 154          * Be paranoid and flush a concurrent interrupt to make sure
 155          * we don't reactivate any irq tasklets after parking.
 156          *
 157          * FIXME: Note that even though we have waited for execlists to be idle,
 158          * there may still be an in-flight interrupt even though the CSB
 159          * is now empty. synchronize_irq() makes sure that a residual interrupt
 160          * is completed before we continue, but it doesn't prevent the HW from
 161          * raising a spurious interrupt later. To complete the shield we should
 162          * coordinate disabling the CS irq with flushing the interrupts.
 163          */
 164         synchronize_irq(i915->drm.irq);
 165
 166         intel_engines_park(i915);
 167         i915_timelines_park(i915);
 168
 169         i915_pmu_gt_parked(i915);
 170         i915_vma_parked(i915);
 171
 172         i915->gt.awake = false;
 173
 174         if (INTEL_GEN(i915) >= 6)
 175                 gen6_rps_idle(i915);
 176
 177         if (NEEDS_RC6_CTX_CORRUPTION_WA(i915)) {
 178                 i915_rc6_ctx_wa_check(i915);
 179                 intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
 180         }
 181
 182         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);
 183
 184         intel_runtime_pm_put(i915);
 185
 186         return i915->gt.epoch;
 187 }
 188
 189 void i915_gem_park(struct drm_i915_private *i915)
 190 {
 191         GEM_TRACE("\n");
 192
 193         lockdep_assert_held(&i915->drm.struct_mutex);
 194         GEM_BUG_ON(i915->gt.active_requests);
 195
 196         if (!i915->gt.awake)
 197                 return;
 198
 199         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
 200         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
 201 }
 202
 203 void i915_gem_unpark(struct drm_i915_private *i915)
 204 {
 205         GEM_TRACE("\n");
 206
 207         lockdep_assert_held(&i915->drm.struct_mutex);
 208         GEM_BUG_ON(!i915->gt.active_requests);
 209
 210         if (i915->gt.awake)
 211                 return;
 212
 213         intel_runtime_pm_get_noresume(i915);
 214
 215         /*
 216          * It seems that the DMC likes to transition between the DC states a lot
 217          * when there are no connected displays (no active power domains) during
 218          * command submission.
 219          *
 220          * This activity has negative impact on the performance of the chip with
 221          * huge latencies observed in the interrupt handler and elsewhere.
 222          *
 223          * Work around it by grabbing a GT IRQ power domain whilst there is any
 224          * GT activity, preventing any DC state transitions.
 225          */
 226         intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
 227
 228         if (NEEDS_RC6_CTX_CORRUPTION_WA(i915))
 229                 intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
 230
 231         i915->gt.awake = true;
 232         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
 233                 i915->gt.epoch = 1;
 234
 235         intel_enable_gt_powersave(i915);
 236         i915_update_gfx_val(i915);
 237         if (INTEL_GEN(i915) >= 6)
 238                 gen6_rps_busy(i915);
 239         i915_pmu_gt_unparked(i915);
 240
 241         intel_engines_unpark(i915);
 242
 243         i915_queue_hangcheck(i915);
 244
 245         queue_delayed_work(i915->wq,
 246                            &i915->gt.retire_work,
 247                            round_jiffies_up_relative(HZ));
 248 }
 249
 250 int
 251 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 252                             struct drm_file *file)
 253 {
 254         struct drm_i915_private *dev_priv = to_i915(dev);
 255         struct i915_ggtt *ggtt = &dev_priv->ggtt;
 256         struct drm_i915_gem_get_aperture *args = data;
 257         struct i915_vma *vma;
 258         u64 pinned;
 259
 260         pinned = ggtt->vm.reserved;
 261         mutex_lock(&dev->struct_mutex);
 262         list_for_each_entry(vma, &ggtt->vm.active_list, vm_link)
 263                 if (i915_vma_is_pinned(vma))
 264                         pinned += vma->node.size;
 265         list_for_each_entry(vma, &ggtt->vm.inactive_list, vm_link)
 266                 if (i915_vma_is_pinned(vma))
 267                         pinned += vma->node.size;
 268         mutex_unlock(&dev->struct_mutex);
 269
 270         args->aper_size = ggtt->vm.total;
 271         args->aper_available_size = args->aper_size - pinned;
 272
 273         return 0;
 274 }
 275
 276 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 277 {
 278         struct address_space *mapping = obj->base.filp->f_mapping;
 279         drm_dma_handle_t *phys;
 280         struct sg_table *st;
 281         struct scatterlist *sg;
 282         char *vaddr;
 283         int i;
 284         int err;
 285
 286         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
 287                 return -EINVAL;
 288
 289         /* Always aligning to the object size, allows a single allocation
 290          * to handle all possible callers, and given typical object sizes,
 291          * the alignment of the buddy allocation will naturally match.
 292          */
 293         phys = drm_pci_alloc(obj->base.dev,
 294                              roundup_pow_of_two(obj->base.size),
 295                              roundup_pow_of_two(obj->base.size));
 296         if (!phys)
 297                 return -ENOMEM;
 298
 299         vaddr = phys->vaddr;
 300         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 301                 struct page *page;
 302                 char *src;
 303
 304                 page = shmem_read_mapping_page(mapping, i);
 305                 if (IS_ERR(page)) {
 306                         err = PTR_ERR(page);
 307                         goto err_phys;
 308                 }
 309
 310                 src = kmap_atomic(page);
 311                 memcpy(vaddr, src, PAGE_SIZE);
 312                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
 313                 kunmap_atomic(src);
 314
 315                 put_page(page);
 316                 vaddr += PAGE_SIZE;
 317         }
 318
 319         i915_gem_chipset_flush(to_i915(obj->base.dev));
 320
 321         st = kmalloc(sizeof(*st), GFP_KERNEL);
 322         if (!st) {
 323                 err = -ENOMEM;
 324                 goto err_phys;
 325         }
 326
 327         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
 328                 kfree(st);
 329                 err = -ENOMEM;
 330                 goto err_phys;
 331         }
 332
 333         sg = st->sgl;
 334         sg->offset = 0;
 335         sg->length = obj->base.size;
 336
 337         sg_dma_address(sg) = phys->busaddr;
 338         sg_dma_len(sg) = obj->base.size;
 339
 340         obj->phys_handle = phys;
 341
 342         __i915_gem_object_set_pages(obj, st, sg->length);
 343
 344         return 0;
 345
 346 err_phys:
 347         drm_pci_free(obj->base.dev, phys);
 348
 349         return err;
 350 }
 351
 352 static void __start_cpu_write(struct drm_i915_gem_object *obj)
 353 {
 354         obj->read_domains = I915_GEM_DOMAIN_CPU;
 355         obj->write_domain = I915_GEM_DOMAIN_CPU;
 356         if (cpu_write_needs_clflush(obj))
 357                 obj->cache_dirty = true;
 358 }
 359
 360 static void
 361 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 362                                 struct sg_table *pages,
 363                                 bool needs_clflush)
 364 {
 365         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
 366
 367         if (obj->mm.madv == I915_MADV_DONTNEED)
 368                 obj->mm.dirty = false;
 369
 370         if (needs_clflush &&
 371             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
 372             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
 373                 drm_clflush_sg(pages);
 374
 375         __start_cpu_write(obj);
 376 }
 377
 378 static void
 379 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
 380                                struct sg_table *pages)
 381 {
 382         __i915_gem_object_release_shmem(obj, pages, false);
 383
 384         if (obj->mm.dirty) {
 385                 struct address_space *mapping = obj->base.filp->f_mapping;
 386                 char *vaddr = obj->phys_handle->vaddr;
 387                 int i;
 388
 389                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 390                         struct page *page;
 391                         char *dst;
 392
 393                         page = shmem_read_mapping_page(mapping, i);
 394                         if (IS_ERR(page))
 395                                 continue;
 396
 397                         dst = kmap_atomic(page);
 398                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
 399                         memcpy(dst, vaddr, PAGE_SIZE);
 400                         kunmap_atomic(dst);
 401
 402                         set_page_dirty(page);
 403                         if (obj->mm.madv == I915_MADV_WILLNEED)
 404                                 mark_page_accessed(page);
 405                         put_page(page);
 406                         vaddr += PAGE_SIZE;
 407                 }
 408                 obj->mm.dirty = false;
 409         }
 410
 411         sg_free_table(pages);
 412         kfree(pages);
 413
 414         drm_pci_free(obj->base.dev, obj->phys_handle);
 415 }
 416
 417 static void
 418 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
 419 {
 420         i915_gem_object_unpin_pages(obj);
 421 }
 422
 423 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
 424         .get_pages = i915_gem_object_get_pages_phys,
 425         .put_pages = i915_gem_object_put_pages_phys,
 426         .release = i915_gem_object_release_phys,
 427 };
 428
 429 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
 430
 431 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 432 {
 433         struct i915_vma *vma;
 434         LIST_HEAD(still_in_list);
 435         int ret;
 436
 437         lockdep_assert_held(&obj->base.dev->struct_mutex);
 438
 439         /* Closed vma are removed from the obj->vma_list - but they may
 440          * still have an active binding on the object. To remove those we
 441          * must wait for all rendering to complete to the object (as unbinding
 442          * must anyway), and retire the requests.
 443          */
 444         ret = i915_gem_object_set_to_cpu_domain(obj, false);
 445         if (ret)
 446                 return ret;
 447
 448         while ((vma = list_first_entry_or_null(&obj->vma_list,
 449                                                struct i915_vma,
 450                                                obj_link))) {
 451                 list_move_tail(&vma->obj_link, &still_in_list);
 452                 ret = i915_vma_unbind(vma);
 453                 if (ret)
 454                         break;
 455         }
 456         list_splice(&still_in_list, &obj->vma_list);
 457
 458         return ret;
 459 }
 460
 461 static long
 462 i915_gem_object_wait_fence(struct dma_fence *fence,
 463                            unsigned int flags,
 464                            long timeout,
 465                            struct intel_rps_client *rps_client)
 466 {
 467         struct i915_request *rq;
 468
 469         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
 470
 471         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
 472                 return timeout;
 473
 474         if (!dma_fence_is_i915(fence))
 475                 return dma_fence_wait_timeout(fence,
 476                                               flags & I915_WAIT_INTERRUPTIBLE,
 477                                               timeout);
 478
 479         rq = to_request(fence);
 480         if (i915_request_completed(rq))
 481                 goto out;
 482
 483         /*
 484          * This client is about to stall waiting for the GPU. In many cases
 485          * this is undesirable and limits the throughput of the system, as
 486          * many clients cannot continue processing user input/output whilst
 487          * blocked. RPS autotuning may take tens of milliseconds to respond
 488          * to the GPU load and thus incurs additional latency for the client.
 489          * We can circumvent that by promoting the GPU frequency to maximum
 490          * before we wait. This makes the GPU throttle up much more quickly
 491          * (good for benchmarks and user experience, e.g. window animations),
 492          * but at a cost of spending more power processing the workload
 493          * (bad for battery). Not all clients even want their results
 494          * immediately and for them we should just let the GPU select its own
 495          * frequency to maximise efficiency. To prevent a single client from
 496          * forcing the clocks too high for the whole system, we only allow
 497          * each client to waitboost once in a busy period.
 498          */
 499         if (rps_client && !i915_request_started(rq)) {
 500                 if (INTEL_GEN(rq->i915) >= 6)
 501                         gen6_rps_boost(rq, rps_client);
 502         }
 503
 504         timeout = i915_request_wait(rq, flags, timeout);
 505
 506 out:
 507         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
 508                 i915_request_retire_upto(rq);
 509
 510         return timeout;
 511 }
 512
 513 static long
 514 i915_gem_object_wait_reservation(struct reservation_object *resv,
 515                                  unsigned int flags,
 516                                  long timeout,
 517                                  struct intel_rps_client *rps_client)
 518 {
 519         unsigned int seq = __read_seqcount_begin(&resv->seq);
 520         struct dma_fence *excl;
 521         bool prune_fences = false;
 522
 523         if (flags & I915_WAIT_ALL) {
 524                 struct dma_fence **shared;
 525                 unsigned int count, i;
 526                 int ret;
 527
 528                 ret = reservation_object_get_fences_rcu(resv,
 529                                                         &excl, &count, &shared);
 530                 if (ret)
 531                         return ret;
 532
 533                 for (i = 0; i < count; i++) {
 534                         timeout = i915_gem_object_wait_fence(shared[i],
 535                                                              flags, timeout,
 536                                                              rps_client);
 537                         if (timeout < 0)
 538                                 break;
 539
 540                         dma_fence_put(shared[i]);
 541                 }
 542
 543                 for (; i < count; i++)
 544                         dma_fence_put(shared[i]);
 545                 kfree(shared);
 546
 547                 /*
 548                  * If both shared fences and an exclusive fence exist,
 549                  * then by construction the shared fences must be later
 550                  * than the exclusive fence. If we successfully wait for
 551                  * all the shared fences, we know that the exclusive fence
 552                  * must all be signaled. If all the shared fences are
 553                  * signaled, we can prune the array and recover the
 554                  * floating references on the fences/requests.
 555                  */
 556                 prune_fences = count && timeout >= 0;
 557         } else {
 558                 excl = reservation_object_get_excl_rcu(resv);
 559         }
 560
 561         if (excl && timeout >= 0)
 562                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
 563                                                      rps_client);
 564
 565         dma_fence_put(excl);
 566
 567         /*
 568          * Opportunistically prune the fences iff we know they have *all* been
 569          * signaled and that the reservation object has not been changed (i.e.
 570          * no new fences have been added).
 571          */
 572         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
 573                 if (reservation_object_trylock(resv)) {
 574                         if (!__read_seqcount_retry(&resv->seq, seq))
 575                                 reservation_object_add_excl_fence(resv, NULL);
 576                         reservation_object_unlock(resv);
 577                 }
 578         }
 579
 580         return timeout;
 581 }
 582
 583 static void __fence_set_priority(struct dma_fence *fence,
 584                                  const struct i915_sched_attr *attr)
 585 {
 586         struct i915_request *rq;
 587         struct intel_engine_cs *engine;
 588
 589         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
 590                 return;
 591
 592         rq = to_request(fence);
 593         engine = rq->engine;
 594
 595         local_bh_disable();
 596         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
 597         if (engine->schedule)
 598                 engine->schedule(rq, attr);
 599         rcu_read_unlock();
 600         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
 601 }
 602
 603 static void fence_set_priority(struct dma_fence *fence,
 604                                const struct i915_sched_attr *attr)
 605 {
 606         /* Recurse once into a fence-array */
 607         if (dma_fence_is_array(fence)) {
 608                 struct dma_fence_array *array = to_dma_fence_array(fence);
 609                 int i;
 610
 611                 for (i = 0; i < array->num_fences; i++)
 612                         __fence_set_priority(array->fences[i], attr);
 613         } else {
 614                 __fence_set_priority(fence, attr);
 615         }
 616 }
 617
 618 int
 619 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
 620                               unsigned int flags,
 621                               const struct i915_sched_attr *attr)
 622 {
 623         struct dma_fence *excl;
 624
 625         if (flags & I915_WAIT_ALL) {
 626                 struct dma_fence **shared;
 627                 unsigned int count, i;
 628                 int ret;
 629
 630                 ret = reservation_object_get_fences_rcu(obj->resv,
 631                                                         &excl, &count, &shared);
 632                 if (ret)
 633                         return ret;
 634
 635                 for (i = 0; i < count; i++) {
 636                         fence_set_priority(shared[i], attr);
 637                         dma_fence_put(shared[i]);
 638                 }
 639
 640                 kfree(shared);
 641         } else {
 642                 excl = reservation_object_get_excl_rcu(obj->resv);
 643         }
 644
 645         if (excl) {
 646                 fence_set_priority(excl, attr);
 647                 dma_fence_put(excl);
 648         }
 649         return 0;
 650 }
 651
 652 /**
 653  * Waits for rendering to the object to be completed
 654  * @obj: i915 gem object
 655  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
 656  * @timeout: how long to wait
 657  * @rps_client: client (user process) to charge for any waitboosting
 658  */
 659 int
 660 i915_gem_object_wait(struct drm_i915_gem_object *obj,
 661                      unsigned int flags,
 662                      long timeout,
 663                      struct intel_rps_client *rps_client)
 664 {
 665         might_sleep();
 666 #if IS_ENABLED(CONFIG_LOCKDEP)
 667         GEM_BUG_ON(debug_locks &&
 668                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
 669                    !!(flags & I915_WAIT_LOCKED));
 670 #endif
 671         GEM_BUG_ON(timeout < 0);
 672
 673         timeout = i915_gem_object_wait_reservation(obj->resv,
 674                                                    flags, timeout,
 675                                                    rps_client);
 676         return timeout < 0 ? timeout : 0;
 677 }
 678
 679 static struct intel_rps_client *to_rps_client(struct drm_file *file)
 680 {
 681         struct drm_i915_file_private *fpriv = file->driver_priv;
 682
 683         return &fpriv->rps_client;
 684 }
 685
 686 static int
 687 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
 688                      struct drm_i915_gem_pwrite *args,
 689                      struct drm_file *file)
 690 {
 691         void *vaddr = obj->phys_handle->vaddr + args->offset;
 692         char __user *user_data = u64_to_user_ptr(args->data_ptr);
 693
 694         /* We manually control the domain here and pretend that it
 695          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
 696          */
 697         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 698         if (copy_from_user(vaddr, user_data, args->size))
 699                 return -EFAULT;
 700
 701         drm_clflush_virt_range(vaddr, args->size);
 702         i915_gem_chipset_flush(to_i915(obj->base.dev));
 703
 704         intel_fb_obj_flush(obj, ORIGIN_CPU);
 705         return 0;
 706 }
 707
 708 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
 709 {
 710         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
 711 }
 712
 713 void i915_gem_object_free(struct drm_i915_gem_object *obj)
 714 {
 715         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 716         kmem_cache_free(dev_priv->objects, obj);
 717 }
 718
 719 static int
 720 i915_gem_create(struct drm_file *file,
 721                 struct drm_i915_private *dev_priv,
 722                 uint64_t size,
 723                 uint32_t *handle_p)
 724 {
 725         struct drm_i915_gem_object *obj;
 726         int ret;
 727         u32 handle;
 728
 729         size = roundup(size, PAGE_SIZE);
 730         if (size == 0)
 731                 return -EINVAL;
 732
 733         /* Allocate the new object */
 734         obj = i915_gem_object_create(dev_priv, size);
 735         if (IS_ERR(obj))
 736                 return PTR_ERR(obj);
 737
 738         ret = drm_gem_handle_create(file, &obj->base, &handle);
 739         /* drop reference from allocate - handle holds it now */
 740         i915_gem_object_put(obj);
 741         if (ret)
 742                 return ret;
 743
 744         *handle_p = handle;
 745         return 0;
 746 }
 747
 748 int
 749 i915_gem_dumb_create(struct drm_file *file,
 750                      struct drm_device *dev,
 751                      struct drm_mode_create_dumb *args)
 752 {
 753         /* have to work out size/pitch and return them */
 754         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
 755         args->size = args->pitch * args->height;
 756         return i915_gem_create(file, to_i915(dev),
 757                                args->size, &args->handle);
 758 }
 759
 760 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 761 {
 762         return !(obj->cache_level == I915_CACHE_NONE ||
 763                  obj->cache_level == I915_CACHE_WT);
 764 }
 765
 766 /**
 767  * Creates a new mm object and returns a handle to it.
 768  * @dev: drm device pointer
 769  * @data: ioctl data blob
 770  * @file: drm file pointer
 771  */
 772 int
 773 i915_gem_create_ioctl(struct drm_device *dev, void *data,
 774                       struct drm_file *file)
 775 {
 776         struct drm_i915_private *dev_priv = to_i915(dev);
 777         struct drm_i915_gem_create *args = data;
 778
 779         i915_gem_flush_free_objects(dev_priv);
 780
 781         return i915_gem_create(file, dev_priv,
 782                                args->size, &args->handle);
 783 }
 784
 785 static inline enum fb_op_origin
 786 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
 787 {
 788         return (domain == I915_GEM_DOMAIN_GTT ?
 789                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
 790 }
 791
 792 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
 793 {
 794         /*
 795          * No actual flushing is required for the GTT write domain for reads
 796          * from the GTT domain. Writes to it "immediately" go to main memory
 797          * as far as we know, so there's no chipset flush. It also doesn't
 798          * land in the GPU render cache.
 799          *
 800          * However, we do have to enforce the order so that all writes through
 801          * the GTT land before any writes to the device, such as updates to
 802          * the GATT itself.
 803          *
 804          * We also have to wait a bit for the writes to land from the GTT.
 805          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
 806          * timing. This issue has only been observed when switching quickly
 807          * between GTT writes and CPU reads from inside the kernel on recent hw,
 808          * and it appears to only affect discrete GTT blocks (i.e. on LLC
 809          * system agents we cannot reproduce this behaviour, until Cannonlake
 810          * that was!).
 811          */
 812
 813         i915_gem_chipset_flush(dev_priv);
 814
 815         intel_runtime_pm_get(dev_priv);
 816         spin_lock_irq(&dev_priv->uncore.lock);
 817
 818         POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
 819
 820         spin_unlock_irq(&dev_priv->uncore.lock);
 821         intel_runtime_pm_put(dev_priv);
 822 }
 823
 824 static void
 825 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
 826 {
 827         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 828         struct i915_vma *vma;
 829
 830         if (!(obj->write_domain & flush_domains))
 831                 return;
 832
 833         switch (obj->write_domain) {
 834         case I915_GEM_DOMAIN_GTT:
 835                 i915_gem_flush_ggtt_writes(dev_priv);
 836
 837                 intel_fb_obj_flush(obj,
 838                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
 839
 840                 for_each_ggtt_vma(vma, obj) {
 841                         if (vma->iomap)
 842                                 continue;
 843
 844                         i915_vma_unset_ggtt_write(vma);
 845                 }
 846                 break;
 847
 848         case I915_GEM_DOMAIN_WC:
 849                 wmb();
 850                 break;
 851
 852         case I915_GEM_DOMAIN_CPU:
 853                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 854                 break;
 855
 856         case I915_GEM_DOMAIN_RENDER:
 857                 if (gpu_write_needs_clflush(obj))
 858                         obj->cache_dirty = true;
 859                 break;
 860         }
 861
 862         obj->write_domain = 0;
 863 }
 864
 865 static inline int
 866 __copy_to_user_swizzled(char __user *cpu_vaddr,
 867                         const char *gpu_vaddr, int gpu_offset,
 868                         int length)
 869 {
 870         int ret, cpu_offset = 0;
 871
 872         while (length > 0) {
 873                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
 874                 int this_length = min(cacheline_end - gpu_offset, length);
 875                 int swizzled_gpu_offset = gpu_offset ^ 64;
 876
 877                 ret = __copy_to_user(cpu_vaddr + cpu_offset,
 878                                      gpu_vaddr + swizzled_gpu_offset,
 879                                      this_length);
 880                 if (ret)
 881                         return ret + length;
 882
 883                 cpu_offset += this_length;
 884                 gpu_offset += this_length;
 885                 length -= this_length;
 886         }
 887
 888         return 0;
 889 }
 890
 891 static inline int
 892 __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
 893                           const char __user *cpu_vaddr,
 894                           int length)
 895 {
 896         int ret, cpu_offset = 0;
 897
 898         while (length > 0) {
 899                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
 900                 int this_length = min(cacheline_end - gpu_offset, length);
 901                 int swizzled_gpu_offset = gpu_offset ^ 64;
 902
 903                 ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
 904                                        cpu_vaddr + cpu_offset,
 905                                        this_length);
 906                 if (ret)
 907                         return ret + length;
 908
 909                 cpu_offset += this_length;
 910                 gpu_offset += this_length;
 911                 length -= this_length;
 912         }
 913
 914         return 0;
 915 }
 916
 917 /*
 918  * Pins the specified object's pages and synchronizes the object with
 919  * GPU accesses. Sets needs_clflush to non-zero if the caller should
 920  * flush the object from the CPU cache.
 921  */
 922 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
 923                                     unsigned int *needs_clflush)
 924 {
 925         int ret;
 926
 927         lockdep_assert_held(&obj->base.dev->struct_mutex);
 928
 929         *needs_clflush = 0;
 930         if (!i915_gem_object_has_struct_page(obj))
 931                 return -ENODEV;
 932
 933         ret = i915_gem_object_wait(obj,
 934                                    I915_WAIT_INTERRUPTIBLE |
 935                                    I915_WAIT_LOCKED,
 936                                    MAX_SCHEDULE_TIMEOUT,
 937                                    NULL);
 938         if (ret)
 939                 return ret;
 940
 941         ret = i915_gem_object_pin_pages(obj);
 942         if (ret)
 943                 return ret;
 944
 945         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
 946             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 947                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
 948                 if (ret)
 949                         goto err_unpin;
 950                 else
 951                         goto out;
 952         }
 953
 954         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 955
 956         /* If we're not in the cpu read domain, set ourself into the gtt
 957          * read domain and manually flush cachelines (if required). This
 958          * optimizes for the case when the gpu will dirty the data
 959          * anyway again before the next pread happens.
 960          */
 961         if (!obj->cache_dirty &&
 962             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
 963                 *needs_clflush = CLFLUSH_BEFORE;
 964
 965 out:
 966         /* return with the pages pinned */
 967         return 0;
 968
 969 err_unpin:
 970         i915_gem_object_unpin_pages(obj);
 971         return ret;
 972 }
 973
 974 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
 975                                      unsigned int *needs_clflush)
 976 {
 977         int ret;
 978
 979         lockdep_assert_held(&obj->base.dev->struct_mutex);
 980
 981         *needs_clflush = 0;
 982         if (!i915_gem_object_has_struct_page(obj))
 983                 return -ENODEV;
 984
 985         ret = i915_gem_object_wait(obj,
 986                                    I915_WAIT_INTERRUPTIBLE |
 987                                    I915_WAIT_LOCKED |
 988                                    I915_WAIT_ALL,
 989                                    MAX_SCHEDULE_TIMEOUT,
 990                                    NULL);
 991         if (ret)
 992                 return ret;
 993
 994         ret = i915_gem_object_pin_pages(obj);
 995         if (ret)
 996                 return ret;
 997
 998         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
 999             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
1000                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
1001                 if (ret)
1002                         goto err_unpin;
1003                 else
1004                         goto out;
1005         }
1006
1007         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
1008
1009         /* If we're not in the cpu write domain, set ourself into the
1010          * gtt write domain and manually flush cachelines (as required).
1011          * This optimizes for the case when the gpu will use the data
1012          * right away and we therefore have to clflush anyway.
1013          */
1014         if (!obj->cache_dirty) {
1015                 *needs_clflush |= CLFLUSH_AFTER;
1016
1017                 /*
1018                  * Same trick applies to invalidate partially written
1019                  * cachelines read before writing.
1020                  */
1021                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
1022                         *needs_clflush |= CLFLUSH_BEFORE;
1023         }
1024
1025 out:
1026         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1027         obj->mm.dirty = true;
1028         /* return with the pages pinned */
1029         return 0;
1030
1031 err_unpin:
1032         i915_gem_object_unpin_pages(obj);
1033         return ret;
1034 }
1035
1036 static void
1037 shmem_clflush_swizzled_range(char *addr, unsigned long length,
1038                              bool swizzled)
1039 {
1040         if (unlikely(swizzled)) {
1041                 unsigned long start = (unsigned long) addr;
1042                 unsigned long end = (unsigned long) addr + length;
1043
1044                 /* For swizzling simply ensure that we always flush both
1045                  * channels. Lame, but simple and it works. Swizzled
1046                  * pwrite/pread is far from a hotpath - current userspace
1047                  * doesn't use it at all. */
1048                 start = round_down(start, 128);
1049                 end = round_up(end, 128);
1050
1051                 drm_clflush_virt_range((void *)start, end - start);
1052         } else {
1053                 drm_clflush_virt_range(addr, length);
1054         }
1055
1056 }
1057
1058 /* Only difference to the fast-path function is that this can handle bit17
1059  * and uses non-atomic copy and kmap functions. */
1060 static int
1061 shmem_pread_slow(struct page *page, int offset, int length,
1062                  char __user *user_data,
1063                  bool page_do_bit17_swizzling, bool needs_clflush)
1064 {
1065         char *vaddr;
1066         int ret;
1067
1068         vaddr = kmap(page);
1069         if (needs_clflush)
1070                 shmem_clflush_swizzled_range(vaddr + offset, length,
1071                                              page_do_bit17_swizzling);
1072
1073         if (page_do_bit17_swizzling)
1074                 ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
1075         else
1076                 ret = __copy_to_user(user_data, vaddr + offset, length);
1077         kunmap(page);
1078
1079         return ret ? - EFAULT : 0;
1080 }
1081
1082 static int
1083 shmem_pread(struct page *page, int offset, int length, char __user *user_data,
1084             bool page_do_bit17_swizzling, bool needs_clflush)
1085 {
1086         int ret;
1087
1088         ret = -ENODEV;
1089         if (!page_do_bit17_swizzling) {
1090                 char *vaddr = kmap_atomic(page);
1091
1092                 if (needs_clflush)
1093                         drm_clflush_virt_range(vaddr + offset, length);
1094                 ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
1095                 kunmap_atomic(vaddr);
1096         }
1097         if (ret == 0)
1098                 return 0;
1099
1100         return shmem_pread_slow(page, offset, length, user_data,
1101                                 page_do_bit17_swizzling, needs_clflush);
1102 }
1103
1104 static int
1105 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1106                      struct drm_i915_gem_pread *args)
1107 {
1108         char __user *user_data;
1109         u64 remain;
1110         unsigned int obj_do_bit17_swizzling;
1111         unsigned int needs_clflush;
1112         unsigned int idx, offset;
1113         int ret;
1114
1115         obj_do_bit17_swizzling = 0;
1116         if (i915_gem_object_needs_bit17_swizzle(obj))
1117                 obj_do_bit17_swizzling = BIT(17);
1118
1119         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1120         if (ret)
1121                 return ret;
1122
1123         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1124         mutex_unlock(&obj->base.dev->struct_mutex);
1125         if (ret)
1126                 return ret;
1127
1128         remain = args->size;
1129         user_data = u64_to_user_ptr(args->data_ptr);
1130         offset = offset_in_page(args->offset);
1131         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1132                 struct page *page = i915_gem_object_get_page(obj, idx);
1133                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1134
1135                 ret = shmem_pread(page, offset, length, user_data,
1136                                   page_to_phys(page) & obj_do_bit17_swizzling,
1137                                   needs_clflush);
1138                 if (ret)
1139                         break;
1140
1141                 remain -= length;
1142                 user_data += length;
1143                 offset = 0;
1144         }
1145
1146         i915_gem_obj_finish_shmem_access(obj);
1147         return ret;
1148 }
1149
1150 static inline bool
1151 gtt_user_read(struct io_mapping *mapping,
1152               loff_t base, int offset,
1153               char __user *user_data, int length)
1154 {
1155         void __iomem *vaddr;
1156         unsigned long unwritten;
1157
1158         /* We can use the cpu mem copy function because this is X86. */
1159         vaddr = io_mapping_map_atomic_wc(mapping, base);
1160         unwritten = __copy_to_user_inatomic(user_data,
1161                                             (void __force *)vaddr + offset,
1162                                             length);
1163         io_mapping_unmap_atomic(vaddr);
1164         if (unwritten) {
1165                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1166                 unwritten = copy_to_user(user_data,
1167                                          (void __force *)vaddr + offset,
1168                                          length);
1169                 io_mapping_unmap(vaddr);
1170         }
1171         return unwritten;
1172 }
1173
1174 static int
1175 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1176                    const struct drm_i915_gem_pread *args)
1177 {
1178         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1179         struct i915_ggtt *ggtt = &i915->ggtt;
1180         struct drm_mm_node node;
1181         struct i915_vma *vma;
1182         void __user *user_data;
1183         u64 remain, offset;
1184         int ret;
1185
1186         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1187         if (ret)
1188                 return ret;
1189
1190         intel_runtime_pm_get(i915);
1191         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1192                                        PIN_MAPPABLE |
1193                                        PIN_NONFAULT |
1194                                        PIN_NONBLOCK);
1195         if (!IS_ERR(vma)) {
1196                 node.start = i915_ggtt_offset(vma);
1197                 node.allocated = false;
1198                 ret = i915_vma_put_fence(vma);
1199                 if (ret) {
1200                         i915_vma_unpin(vma);
1201                         vma = ERR_PTR(ret);
1202                 }
1203         }
1204         if (IS_ERR(vma)) {
1205                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1206                 if (ret)
1207                         goto out_unlock;
1208                 GEM_BUG_ON(!node.allocated);
1209         }
1210
1211         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1212         if (ret)
1213                 goto out_unpin;
1214
1215         mutex_unlock(&i915->drm.struct_mutex);
1216
1217         user_data = u64_to_user_ptr(args->data_ptr);
1218         remain = args->size;
1219         offset = args->offset;
1220
1221         while (remain > 0) {
1222                 /* Operation in this page
1223                  *
1224                  * page_base = page offset within aperture
1225                  * page_offset = offset within page
1226                  * page_length = bytes to copy for this page
1227                  */
1228                 u32 page_base = node.start;
1229                 unsigned page_offset = offset_in_page(offset);
1230                 unsigned page_length = PAGE_SIZE - page_offset;
1231                 page_length = remain < page_length ? remain : page_length;
1232                 if (node.allocated) {
1233                         wmb();
1234                         ggtt->vm.insert_page(&ggtt->vm,
1235                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1236                                              node.start, I915_CACHE_NONE, 0);
1237                         wmb();
1238                 } else {
1239                         page_base += offset & PAGE_MASK;
1240                 }
1241
1242                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1243                                   user_data, page_length)) {
1244                         ret = -EFAULT;
1245                         break;
1246                 }
1247
1248                 remain -= page_length;
1249                 user_data += page_length;
1250                 offset += page_length;
1251         }
1252
1253         mutex_lock(&i915->drm.struct_mutex);
1254 out_unpin:
1255         if (node.allocated) {
1256                 wmb();
1257                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1258                 remove_mappable_node(&node);
1259         } else {
1260                 i915_vma_unpin(vma);
1261         }
1262 out_unlock:
1263         intel_runtime_pm_put(i915);
1264         mutex_unlock(&i915->drm.struct_mutex);
1265
1266         return ret;
1267 }
1268
1269 /**
1270  * Reads data from the object referenced by handle.
1271  * @dev: drm device pointer
1272  * @data: ioctl data blob
1273  * @file: drm file pointer
1274  *
1275  * On error, the contents of *data are undefined.
1276  */
1277 int
1278 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1279                      struct drm_file *file)
1280 {
1281         struct drm_i915_gem_pread *args = data;
1282         struct drm_i915_gem_object *obj;
1283         int ret;
1284
1285         if (args->size == 0)
1286                 return 0;
1287
1288         if (!access_ok(VERIFY_WRITE,
1289                        u64_to_user_ptr(args->data_ptr),
1290                        args->size))
1291                 return -EFAULT;
1292
1293         obj = i915_gem_object_lookup(file, args->handle);
1294         if (!obj)
1295                 return -ENOENT;
1296
1297         /* Bounds check source.  */
1298         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1299                 ret = -EINVAL;
1300                 goto out;
1301         }
1302
1303         trace_i915_gem_object_pread(obj, args->offset, args->size);
1304
1305         ret = i915_gem_object_wait(obj,
1306                                    I915_WAIT_INTERRUPTIBLE,
1307                                    MAX_SCHEDULE_TIMEOUT,
1308                                    to_rps_client(file));
1309         if (ret)
1310                 goto out;
1311
1312         ret = i915_gem_object_pin_pages(obj);
1313         if (ret)
1314                 goto out;
1315
1316         ret = i915_gem_shmem_pread(obj, args);
1317         if (ret == -EFAULT || ret == -ENODEV)
1318                 ret = i915_gem_gtt_pread(obj, args);
1319
1320         i915_gem_object_unpin_pages(obj);
1321 out:
1322         i915_gem_object_put(obj);
1323         return ret;
1324 }
1325
1326 /* This is the fast write path which cannot handle
1327  * page faults in the source data
1328  */
1329
1330 static inline bool
1331 ggtt_write(struct io_mapping *mapping,
1332            loff_t base, int offset,
1333            char __user *user_data, int length)
1334 {
1335         void __iomem *vaddr;
1336         unsigned long unwritten;
1337
1338         /* We can use the cpu mem copy function because this is X86. */
1339         vaddr = io_mapping_map_atomic_wc(mapping, base);
1340         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1341                                                       user_data, length);
1342         io_mapping_unmap_atomic(vaddr);
1343         if (unwritten) {
1344                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1345                 unwritten = copy_from_user((void __force *)vaddr + offset,
1346                                            user_data, length);
1347                 io_mapping_unmap(vaddr);
1348         }
1349
1350         return unwritten;
1351 }
1352
1353 /**
1354  * This is the fast pwrite path, where we copy the data directly from the
1355  * user into the GTT, uncached.
1356  * @obj: i915 GEM object
1357  * @args: pwrite arguments structure
1358  */
1359 static int
1360 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1361                          const struct drm_i915_gem_pwrite *args)
1362 {
1363         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1364         struct i915_ggtt *ggtt = &i915->ggtt;
1365         struct drm_mm_node node;
1366         struct i915_vma *vma;
1367         u64 remain, offset;
1368         void __user *user_data;
1369         int ret;
1370
1371         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1372         if (ret)
1373                 return ret;
1374
1375         if (i915_gem_object_has_struct_page(obj)) {
1376                 /*
1377                  * Avoid waking the device up if we can fallback, as
1378                  * waking/resuming is very slow (worst-case 10-100 ms
1379                  * depending on PCI sleeps and our own resume time).
1380                  * This easily dwarfs any performance advantage from
1381                  * using the cache bypass of indirect GGTT access.
1382                  */
1383                 if (!intel_runtime_pm_get_if_in_use(i915)) {
1384                         ret = -EFAULT;
1385                         goto out_unlock;
1386                 }
1387         } else {
1388                 /* No backing pages, no fallback, we must force GGTT access */
1389                 intel_runtime_pm_get(i915);
1390         }
1391
1392         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1393                                        PIN_MAPPABLE |
1394                                        PIN_NONFAULT |
1395                                        PIN_NONBLOCK);
1396         if (!IS_ERR(vma)) {
1397                 node.start = i915_ggtt_offset(vma);
1398                 node.allocated = false;
1399                 ret = i915_vma_put_fence(vma);
1400                 if (ret) {
1401                         i915_vma_unpin(vma);
1402                         vma = ERR_PTR(ret);
1403                 }
1404         }
1405         if (IS_ERR(vma)) {
1406                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1407                 if (ret)
1408                         goto out_rpm;
1409                 GEM_BUG_ON(!node.allocated);
1410         }
1411
1412         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1413         if (ret)
1414                 goto out_unpin;
1415
1416         mutex_unlock(&i915->drm.struct_mutex);
1417
1418         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1419
1420         user_data = u64_to_user_ptr(args->data_ptr);
1421         offset = args->offset;
1422         remain = args->size;
1423         while (remain) {
1424                 /* Operation in this page
1425                  *
1426                  * page_base = page offset within aperture
1427                  * page_offset = offset within page
1428                  * page_length = bytes to copy for this page
1429                  */
1430                 u32 page_base = node.start;
1431                 unsigned int page_offset = offset_in_page(offset);
1432                 unsigned int page_length = PAGE_SIZE - page_offset;
1433                 page_length = remain < page_length ? remain : page_length;
1434                 if (node.allocated) {
1435                         wmb(); /* flush the write before we modify the GGTT */
1436                         ggtt->vm.insert_page(&ggtt->vm,
1437                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1438                                              node.start, I915_CACHE_NONE, 0);
1439                         wmb(); /* flush modifications to the GGTT (insert_page) */
1440                 } else {
1441                         page_base += offset & PAGE_MASK;
1442                 }
1443                 /* If we get a fault while copying data, then (presumably) our
1444                  * source page isn't available.  Return the error and we'll
1445                  * retry in the slow path.
1446                  * If the object is non-shmem backed, we retry again with the
1447                  * path that handles page fault.
1448                  */
1449                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1450                                user_data, page_length)) {
1451                         ret = -EFAULT;
1452                         break;
1453                 }
1454
1455                 remain -= page_length;
1456                 user_data += page_length;
1457                 offset += page_length;
1458         }
1459         intel_fb_obj_flush(obj, ORIGIN_CPU);
1460
1461         mutex_lock(&i915->drm.struct_mutex);
1462 out_unpin:
1463         if (node.allocated) {
1464                 wmb();
1465                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1466                 remove_mappable_node(&node);
1467         } else {
1468                 i915_vma_unpin(vma);
1469         }
1470 out_rpm:
1471         intel_runtime_pm_put(i915);
1472 out_unlock:
1473         mutex_unlock(&i915->drm.struct_mutex);
1474         return ret;
1475 }
1476
1477 static int
1478 shmem_pwrite_slow(struct page *page, int offset, int length,
1479                   char __user *user_data,
1480                   bool page_do_bit17_swizzling,
1481                   bool needs_clflush_before,
1482                   bool needs_clflush_after)
1483 {
1484         char *vaddr;
1485         int ret;
1486
1487         vaddr = kmap(page);
1488         if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
1489                 shmem_clflush_swizzled_range(vaddr + offset, length,
1490                                              page_do_bit17_swizzling);
1491         if (page_do_bit17_swizzling)
1492                 ret = __copy_from_user_swizzled(vaddr, offset, user_data,
1493                                                 length);
1494         else
1495                 ret = __copy_from_user(vaddr + offset, user_data, length);
1496         if (needs_clflush_after)
1497                 shmem_clflush_swizzled_range(vaddr + offset, length,
1498                                              page_do_bit17_swizzling);
1499         kunmap(page);
1500
1501         return ret ? -EFAULT : 0;
1502 }
1503
1504 /* Per-page copy function for the shmem pwrite fastpath.
1505  * Flushes invalid cachelines before writing to the target if
1506  * needs_clflush_before is set and flushes out any written cachelines after
1507  * writing if needs_clflush is set.
1508  */
1509 static int
1510 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1511              bool page_do_bit17_swizzling,
1512              bool needs_clflush_before,
1513              bool needs_clflush_after)
1514 {
1515         int ret;
1516
1517         ret = -ENODEV;
1518         if (!page_do_bit17_swizzling) {
1519                 char *vaddr = kmap_atomic(page);
1520
1521                 if (needs_clflush_before)
1522                         drm_clflush_virt_range(vaddr + offset, len);
1523                 ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
1524                 if (needs_clflush_after)
1525                         drm_clflush_virt_range(vaddr + offset, len);
1526
1527                 kunmap_atomic(vaddr);
1528         }
1529         if (ret == 0)
1530                 return ret;
1531
1532         return shmem_pwrite_slow(page, offset, len, user_data,
1533                                  page_do_bit17_swizzling,
1534                                  needs_clflush_before,
1535                                  needs_clflush_after);
1536 }
1537
1538 static int
1539 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1540                       const struct drm_i915_gem_pwrite *args)
1541 {
1542         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1543         void __user *user_data;
1544         u64 remain;
1545         unsigned int obj_do_bit17_swizzling;
1546         unsigned int partial_cacheline_write;
1547         unsigned int needs_clflush;
1548         unsigned int offset, idx;
1549         int ret;
1550
1551         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1552         if (ret)
1553                 return ret;
1554
1555         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1556         mutex_unlock(&i915->drm.struct_mutex);
1557         if (ret)
1558                 return ret;
1559
1560         obj_do_bit17_swizzling = 0;
1561         if (i915_gem_object_needs_bit17_swizzle(obj))
1562                 obj_do_bit17_swizzling = BIT(17);
1563
1564         /* If we don't overwrite a cacheline completely we need to be
1565          * careful to have up-to-date data by first clflushing. Don't
1566          * overcomplicate things and flush the entire patch.
1567          */
1568         partial_cacheline_write = 0;
1569         if (needs_clflush & CLFLUSH_BEFORE)
1570                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1571
1572         user_data = u64_to_user_ptr(args->data_ptr);
1573         remain = args->size;
1574         offset = offset_in_page(args->offset);
1575         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1576                 struct page *page = i915_gem_object_get_page(obj, idx);
1577                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1578
1579                 ret = shmem_pwrite(page, offset, length, user_data,
1580                                    page_to_phys(page) & obj_do_bit17_swizzling,
1581                                    (offset | length) & partial_cacheline_write,
1582                                    needs_clflush & CLFLUSH_AFTER);
1583                 if (ret)
1584                         break;
1585
1586                 remain -= length;
1587                 user_data += length;
1588                 offset = 0;
1589         }
1590
1591         intel_fb_obj_flush(obj, ORIGIN_CPU);
1592         i915_gem_obj_finish_shmem_access(obj);
1593         return ret;
1594 }
1595
1596 /**
1597  * Writes data to the object referenced by handle.
1598  * @dev: drm device
1599  * @data: ioctl data blob
1600  * @file: drm file
1601  *
1602  * On error, the contents of the buffer that were to be modified are undefined.
1603  */
1604 int
1605 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1606                       struct drm_file *file)
1607 {
1608         struct drm_i915_gem_pwrite *args = data;
1609         struct drm_i915_gem_object *obj;
1610         int ret;
1611
1612         if (args->size == 0)
1613                 return 0;
1614
1615         if (!access_ok(VERIFY_READ,
1616                        u64_to_user_ptr(args->data_ptr),
1617                        args->size))
1618                 return -EFAULT;
1619
1620         obj = i915_gem_object_lookup(file, args->handle);
1621         if (!obj)
1622                 return -ENOENT;
1623
1624         /* Bounds check destination. */
1625         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1626                 ret = -EINVAL;
1627                 goto err;
1628         }
1629
1630         /* Writes not allowed into this read-only object */
1631         if (i915_gem_object_is_readonly(obj)) {
1632                 ret = -EINVAL;
1633                 goto err;
1634         }
1635
1636         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1637
1638         ret = -ENODEV;
1639         if (obj->ops->pwrite)
1640                 ret = obj->ops->pwrite(obj, args);
1641         if (ret != -ENODEV)
1642                 goto err;
1643
1644         ret = i915_gem_object_wait(obj,
1645                                    I915_WAIT_INTERRUPTIBLE |
1646                                    I915_WAIT_ALL,
1647                                    MAX_SCHEDULE_TIMEOUT,
1648                                    to_rps_client(file));
1649         if (ret)
1650                 goto err;
1651
1652         ret = i915_gem_object_pin_pages(obj);
1653         if (ret)
1654                 goto err;
1655
1656         ret = -EFAULT;
1657         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1658          * it would end up going through the fenced access, and we'll get
1659          * different detiling behavior between reading and writing.
1660          * pread/pwrite currently are reading and writing from the CPU
1661          * perspective, requiring manual detiling by the client.
1662          */
1663         if (!i915_gem_object_has_struct_page(obj) ||
1664             cpu_write_needs_clflush(obj))
1665                 /* Note that the gtt paths might fail with non-page-backed user
1666                  * pointers (e.g. gtt mappings when moving data between
1667                  * textures). Fallback to the shmem path in that case.
1668                  */
1669                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1670
1671         if (ret == -EFAULT || ret == -ENOSPC) {
1672                 if (obj->phys_handle)
1673                         ret = i915_gem_phys_pwrite(obj, args, file);
1674                 else
1675                         ret = i915_gem_shmem_pwrite(obj, args);
1676         }
1677
1678         i915_gem_object_unpin_pages(obj);
1679 err:
1680         i915_gem_object_put(obj);
1681         return ret;
1682 }
1683
1684 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1685 {
1686         struct drm_i915_private *i915;
1687         struct list_head *list;
1688         struct i915_vma *vma;
1689
1690         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1691
1692         for_each_ggtt_vma(vma, obj) {
1693                 if (i915_vma_is_active(vma))
1694                         continue;
1695
1696                 if (!drm_mm_node_allocated(&vma->node))
1697                         continue;
1698
1699                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1700         }
1701
1702         i915 = to_i915(obj->base.dev);
1703         spin_lock(&i915->mm.obj_lock);
1704         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1705         list_move_tail(&obj->mm.link, list);
1706         spin_unlock(&i915->mm.obj_lock);
1707 }
1708
1709 /**
1710  * Called when user space prepares to use an object with the CPU, either
1711  * through the mmap ioctl's mapping or a GTT mapping.
1712  * @dev: drm device
1713  * @data: ioctl data blob
1714  * @file: drm file
1715  */
1716 int
1717 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1718                           struct drm_file *file)
1719 {
1720         struct drm_i915_gem_set_domain *args = data;
1721         struct drm_i915_gem_object *obj;
1722         uint32_t read_domains = args->read_domains;
1723         uint32_t write_domain = args->write_domain;
1724         int err;
1725
1726         /* Only handle setting domains to types used by the CPU. */
1727         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1728                 return -EINVAL;
1729
1730         /* Having something in the write domain implies it's in the read
1731          * domain, and only that read domain.  Enforce that in the request.
1732          */
1733         if (write_domain != 0 && read_domains != write_domain)
1734                 return -EINVAL;
1735
1736         obj = i915_gem_object_lookup(file, args->handle);
1737         if (!obj)
1738                 return -ENOENT;
1739
1740         /* Try to flush the object off the GPU without holding the lock.
1741          * We will repeat the flush holding the lock in the normal manner
1742          * to catch cases where we are gazumped.
1743          */
1744         err = i915_gem_object_wait(obj,
1745                                    I915_WAIT_INTERRUPTIBLE |
1746                                    (write_domain ? I915_WAIT_ALL : 0),
1747                                    MAX_SCHEDULE_TIMEOUT,
1748                                    to_rps_client(file));
1749         if (err)
1750                 goto out;
1751
1752         /*
1753          * Proxy objects do not control access to the backing storage, ergo
1754          * they cannot be used as a means to manipulate the cache domain
1755          * tracking for that backing storage. The proxy object is always
1756          * considered to be outside of any cache domain.
1757          */
1758         if (i915_gem_object_is_proxy(obj)) {
1759                 err = -ENXIO;
1760                 goto out;
1761         }
1762
1763         /*
1764          * Flush and acquire obj->pages so that we are coherent through
1765          * direct access in memory with previous cached writes through
1766          * shmemfs and that our cache domain tracking remains valid.
1767          * For example, if the obj->filp was moved to swap without us
1768          * being notified and releasing the pages, we would mistakenly
1769          * continue to assume that the obj remained out of the CPU cached
1770          * domain.
1771          */
1772         err = i915_gem_object_pin_pages(obj);
1773         if (err)
1774                 goto out;
1775
1776         err = i915_mutex_lock_interruptible(dev);
1777         if (err)
1778                 goto out_unpin;
1779
1780         if (read_domains & I915_GEM_DOMAIN_WC)
1781                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1782         else if (read_domains & I915_GEM_DOMAIN_GTT)
1783                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1784         else
1785                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1786
1787         /* And bump the LRU for this access */
1788         i915_gem_object_bump_inactive_ggtt(obj);
1789
1790         mutex_unlock(&dev->struct_mutex);
1791
1792         if (write_domain != 0)
1793                 intel_fb_obj_invalidate(obj,
1794                                         fb_write_origin(obj, write_domain));
1795
1796 out_unpin:
1797         i915_gem_object_unpin_pages(obj);
1798 out:
1799         i915_gem_object_put(obj);
1800         return err;
1801 }
1802
1803 /**
1804  * Called when user space has done writes to this buffer
1805  * @dev: drm device
1806  * @data: ioctl data blob
1807  * @file: drm file
1808  */
1809 int
1810 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1811                          struct drm_file *file)
1812 {
1813         struct drm_i915_gem_sw_finish *args = data;
1814         struct drm_i915_gem_object *obj;
1815
1816         obj = i915_gem_object_lookup(file, args->handle);
1817         if (!obj)
1818                 return -ENOENT;
1819
1820         /*
1821          * Proxy objects are barred from CPU access, so there is no
1822          * need to ban sw_finish as it is a nop.
1823          */
1824
1825         /* Pinned buffers may be scanout, so flush the cache */
1826         i915_gem_object_flush_if_display(obj);
1827         i915_gem_object_put(obj);
1828
1829         return 0;
1830 }
1831
1832 static inline bool
1833 __vma_matches(struct vm_area_struct *vma, struct file *filp,
1834               unsigned long addr, unsigned long size)
1835 {
1836         if (vma->vm_file != filp)
1837                 return false;
1838
1839         return vma->vm_start == addr &&
1840                (vma->vm_end - vma->vm_start) == PAGE_ALIGN(size);
1841 }
1842
1843 /**
1844  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1845  *                       it is mapped to.
1846  * @dev: drm device
1847  * @data: ioctl data blob
1848  * @file: drm file
1849  *
1850  * While the mapping holds a reference on the contents of the object, it doesn't
1851  * imply a ref on the object itself.
1852  *
1853  * IMPORTANT:
1854  *
1855  * DRM driver writers who look a this function as an example for how to do GEM
1856  * mmap support, please don't implement mmap support like here. The modern way
1857  * to implement DRM mmap support is with an mmap offset ioctl (like
1858  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1859  * That way debug tooling like valgrind will understand what's going on, hiding
1860  * the mmap call in a driver private ioctl will break that. The i915 driver only
1861  * does cpu mmaps this way because we didn't know better.
1862  */
1863 int
1864 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1865                     struct drm_file *file)
1866 {
1867         struct drm_i915_gem_mmap *args = data;
1868         struct drm_i915_gem_object *obj;
1869         unsigned long addr;
1870
1871         if (args->flags & ~(I915_MMAP_WC))
1872                 return -EINVAL;
1873
1874         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1875                 return -ENODEV;
1876
1877         obj = i915_gem_object_lookup(file, args->handle);
1878         if (!obj)
1879                 return -ENOENT;
1880
1881         /* prime objects have no backing filp to GEM mmap
1882          * pages from.
1883          */
1884         if (!obj->base.filp) {
1885                 addr = -ENXIO;
1886                 goto err;
1887         }
1888
1889         if (range_overflows(args->offset, args->size, (u64)obj->base.size)) {
1890                 addr = -EINVAL;
1891                 goto err;
1892         }
1893
1894         addr = vm_mmap(obj->base.filp, 0, args->size,
1895                        PROT_READ | PROT_WRITE, MAP_SHARED,
1896                        args->offset);
1897         if (IS_ERR_VALUE(addr))
1898                 goto err;
1899
1900         if (args->flags & I915_MMAP_WC) {
1901                 struct mm_struct *mm = current->mm;
1902                 struct vm_area_struct *vma;
1903
1904                 if (down_write_killable(&mm->mmap_sem)) {
1905                         addr = -EINTR;
1906                         goto err;
1907                 }
1908                 vma = find_vma(mm, addr);
1909                 if (vma && __vma_matches(vma, obj->base.filp, addr, args->size))
1910                         vma->vm_page_prot =
1911                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1912                 else
1913                         addr = -ENOMEM;
1914                 up_write(&mm->mmap_sem);
1915                 if (IS_ERR_VALUE(addr))
1916                         goto err;
1917
1918                 /* This may race, but that's ok, it only gets set */
1919                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1920         }
1921         i915_gem_object_put(obj);
1922
1923         args->addr_ptr = (uint64_t) addr;
1924         return 0;
1925
1926 err:
1927         i915_gem_object_put(obj);
1928         return addr;
1929 }
1930
1931 static unsigned int tile_row_pages(struct drm_i915_gem_object *obj)
1932 {
1933         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1934 }
1935
1936 /**
1937  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1938  *
1939  * A history of the GTT mmap interface:
1940  *
1941  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1942  *     aligned and suitable for fencing, and still fit into the available
1943  *     mappable space left by the pinned display objects. A classic problem
1944  *     we called the page-fault-of-doom where we would ping-pong between
1945  *     two objects that could not fit inside the GTT and so the memcpy
1946  *     would page one object in at the expense of the other between every
1947  *     single byte.
1948  *
1949  * 1 - Objects can be any size, and have any compatible fencing (X Y, or none
1950  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1951  *     object is too large for the available space (or simply too large
1952  *     for the mappable aperture!), a view is created instead and faulted
1953  *     into userspace. (This view is aligned and sized appropriately for
1954  *     fenced access.)
1955  *
1956  * 2 - Recognise WC as a separate cache domain so that we can flush the
1957  *     delayed writes via GTT before performing direct access via WC.
1958  *
1959  * Restrictions:
1960  *
1961  *  * snoopable objects cannot be accessed via the GTT. It can cause machine
1962  *    hangs on some architectures, corruption on others. An attempt to service
1963  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1964  *
1965  *  * the object must be able to fit into RAM (physical memory, though no
1966  *    limited to the mappable aperture).
1967  *
1968  *
1969  * Caveats:
1970  *
1971  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1972  *    all data to system memory. Subsequent access will not be synchronized.
1973  *
1974  *  * all mappings are revoked on runtime device suspend.
1975  *
1976  *  * there are only 8, 16 or 32 fence registers to share between all users
1977  *    (older machines require fence register for display and blitter access
1978  *    as well). Contention of the fence registers will cause the previous users
1979  *    to be unmapped and any new access will generate new page faults.
1980  *
1981  *  * running out of memory while servicing a fault may generate a SIGBUS,
1982  *    rather than the expected SIGSEGV.
1983  */
1984 int i915_gem_mmap_gtt_version(void)
1985 {
1986         return 2;
1987 }
1988
1989 static inline struct i915_ggtt_view
1990 compute_partial_view(struct drm_i915_gem_object *obj,
1991                      pgoff_t page_offset,
1992                      unsigned int chunk)
1993 {
1994         struct i915_ggtt_view view;
1995
1996         if (i915_gem_object_is_tiled(obj))
1997                 chunk = roundup(chunk, tile_row_pages(obj));
1998
1999         view.type = I915_GGTT_VIEW_PARTIAL;
2000         view.partial.offset = rounddown(page_offset, chunk);
2001         view.partial.size =
2002                 min_t(unsigned int, chunk,
2003                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
2004
2005         /* If the partial covers the entire object, just create a normal VMA. */
2006         if (chunk >= obj->base.size >> PAGE_SHIFT)
2007                 view.type = I915_GGTT_VIEW_NORMAL;
2008
2009         return view;
2010 }
2011
2012 /**
2013  * i915_gem_fault - fault a page into the GTT
2014  * @vmf: fault info
2015  *
2016  * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
2017  * from userspace.  The fault handler takes care of binding the object to
2018  * the GTT (if needed), allocating and programming a fence register (again,
2019  * only if needed based on whether the old reg is still valid or the object
2020  * is tiled) and inserting a new PTE into the faulting process.
2021  *
2022  * Note that the faulting process may involve evicting existing objects
2023  * from the GTT and/or fence registers to make room.  So performance may
2024  * suffer if the GTT working set is large or there are few fence registers
2025  * left.
2026  *
2027  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
2028  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
2029  */
2030 vm_fault_t i915_gem_fault(struct vm_fault *vmf)
2031 {
2032 #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
2033         struct vm_area_struct *area = vmf->vma;
2034         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
2035         struct drm_device *dev = obj->base.dev;
2036         struct drm_i915_private *dev_priv = to_i915(dev);
2037         struct i915_ggtt *ggtt = &dev_priv->ggtt;
2038         bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
2039         struct i915_vma *vma;
2040         pgoff_t page_offset;
2041         int ret;
2042
2043         /* Sanity check that we allow writing into this object */
2044         if (i915_gem_object_is_readonly(obj) && write)
2045                 return VM_FAULT_SIGBUS;
2046
2047         /* We don't use vmf->pgoff since that has the fake offset */
2048         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
2049
2050         trace_i915_gem_object_fault(obj, page_offset, true, write);
2051
2052         /* Try to flush the object off the GPU first without holding the lock.
2053          * Upon acquiring the lock, we will perform our sanity checks and then
2054          * repeat the flush holding the lock in the normal manner to catch cases
2055          * where we are gazumped.
2056          */
2057         ret = i915_gem_object_wait(obj,
2058                                    I915_WAIT_INTERRUPTIBLE,
2059                                    MAX_SCHEDULE_TIMEOUT,
2060                                    NULL);
2061         if (ret)
2062                 goto err;
2063
2064         ret = i915_gem_object_pin_pages(obj);
2065         if (ret)
2066                 goto err;
2067
2068         intel_runtime_pm_get(dev_priv);
2069
2070         ret = i915_mutex_lock_interruptible(dev);
2071         if (ret)
2072                 goto err_rpm;
2073
2074         /* Access to snoopable pages through the GTT is incoherent. */
2075         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
2076                 ret = -EFAULT;
2077                 goto err_unlock;
2078         }
2079
2080
2081         /* Now pin it into the GTT as needed */
2082         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
2083                                        PIN_MAPPABLE |
2084                                        PIN_NONBLOCK |
2085                                        PIN_NONFAULT);
2086         if (IS_ERR(vma)) {
2087                 /* Use a partial view if it is bigger than available space */
2088                 struct i915_ggtt_view view =
2089                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
2090                 unsigned int flags;
2091
2092                 flags = PIN_MAPPABLE;
2093                 if (view.type == I915_GGTT_VIEW_NORMAL)
2094                         flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
2095
2096                 /*
2097                  * Userspace is now writing through an untracked VMA, abandon
2098                  * all hope that the hardware is able to track future writes.
2099                  */
2100                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
2101
2102                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2103                 if (IS_ERR(vma) && !view.type) {
2104                         flags = PIN_MAPPABLE;
2105                         view.type = I915_GGTT_VIEW_PARTIAL;
2106                         vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2107                 }
2108         }
2109         if (IS_ERR(vma)) {
2110                 ret = PTR_ERR(vma);
2111                 goto err_unlock;
2112         }
2113
2114         ret = i915_gem_object_set_to_gtt_domain(obj, write);
2115         if (ret)
2116                 goto err_unpin;
2117
2118         ret = i915_vma_pin_fence(vma);
2119         if (ret)
2120                 goto err_unpin;
2121
2122         /* Finally, remap it using the new GTT offset */
2123         ret = remap_io_mapping(area,
2124                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
2125                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
2126                                min_t(u64, vma->size, area->vm_end - area->vm_start),
2127                                &ggtt->iomap);
2128         if (ret)
2129                 goto err_fence;
2130
2131         /* Mark as being mmapped into userspace for later revocation */
2132         assert_rpm_wakelock_held(dev_priv);
2133         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
2134                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
2135         GEM_BUG_ON(!obj->userfault_count);
2136
2137         i915_vma_set_ggtt_write(vma);
2138
2139 err_fence:
2140         i915_vma_unpin_fence(vma);
2141 err_unpin:
2142         __i915_vma_unpin(vma);
2143 err_unlock:
2144         mutex_unlock(&dev->struct_mutex);
2145 err_rpm:
2146         intel_runtime_pm_put(dev_priv);
2147         i915_gem_object_unpin_pages(obj);
2148 err:
2149         switch (ret) {
2150         case -EIO:
2151                 /*
2152                  * We eat errors when the gpu is terminally wedged to avoid
2153                  * userspace unduly crashing (gl has no provisions for mmaps to
2154                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
2155                  * and so needs to be reported.
2156                  */
2157                 if (!i915_terminally_wedged(&dev_priv->gpu_error))
2158                         return VM_FAULT_SIGBUS;
2159                 /* else: fall through */
2160         case -EAGAIN:
2161                 /*
2162                  * EAGAIN means the gpu is hung and we'll wait for the error
2163                  * handler to reset everything when re-faulting in
2164                  * i915_mutex_lock_interruptible.
2165                  */
2166         case 0:
2167         case -ERESTARTSYS:
2168         case -EINTR:
2169         case -EBUSY:
2170                 /*
2171                  * EBUSY is ok: this just means that another thread
2172                  * already did the job.
2173                  */
2174                 return VM_FAULT_NOPAGE;
2175         case -ENOMEM:
2176                 return VM_FAULT_OOM;
2177         case -ENOSPC:
2178         case -EFAULT:
2179                 return VM_FAULT_SIGBUS;
2180         default:
2181                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2182                 return VM_FAULT_SIGBUS;
2183         }
2184 }
2185
2186 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2187 {
2188         struct i915_vma *vma;
2189
2190         GEM_BUG_ON(!obj->userfault_count);
2191
2192         obj->userfault_count = 0;
2193         list_del(&obj->userfault_link);
2194         drm_vma_node_unmap(&obj->base.vma_node,
2195                            obj->base.dev->anon_inode->i_mapping);
2196
2197         for_each_ggtt_vma(vma, obj)
2198                 i915_vma_unset_userfault(vma);
2199 }
2200
2201 /**
2202  * i915_gem_release_mmap - remove physical page mappings
2203  * @obj: obj in question
2204  *
2205  * Preserve the reservation of the mmapping with the DRM core code, but
2206  * relinquish ownership of the pages back to the system.
2207  *
2208  * It is vital that we remove the page mapping if we have mapped a tiled
2209  * object through the GTT and then lose the fence register due to
2210  * resource pressure. Similarly if the object has been moved out of the
2211  * aperture, than pages mapped into userspace must be revoked. Removing the
2212  * mapping will then trigger a page fault on the next user access, allowing
2213  * fixup by i915_gem_fault().
2214  */
2215 void
2216 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2217 {
2218         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2219
2220         /* Serialisation between user GTT access and our code depends upon
2221          * revoking the CPU's PTE whilst the mutex is held. The next user
2222          * pagefault then has to wait until we release the mutex.
2223          *
2224          * Note that RPM complicates somewhat by adding an additional
2225          * requirement that operations to the GGTT be made holding the RPM
2226          * wakeref.
2227          */
2228         lockdep_assert_held(&i915->drm.struct_mutex);
2229         intel_runtime_pm_get(i915);
2230
2231         if (!obj->userfault_count)
2232                 goto out;
2233
2234         __i915_gem_object_release_mmap(obj);
2235
2236         /* Ensure that the CPU's PTE are revoked and there are not outstanding
2237          * memory transactions from userspace before we return. The TLB
2238          * flushing implied above by changing the PTE above *should* be
2239          * sufficient, an extra barrier here just provides us with a bit
2240          * of paranoid documentation about our requirement to serialise
2241          * memory writes before touching registers / GSM.
2242          */
2243         wmb();
2244
2245 out:
2246         intel_runtime_pm_put(i915);
2247 }
2248
2249 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2250 {
2251         struct drm_i915_gem_object *obj, *on;
2252         int i;
2253
2254         /*
2255          * Only called during RPM suspend. All users of the userfault_list
2256          * must be holding an RPM wakeref to ensure that this can not
2257          * run concurrently with themselves (and use the struct_mutex for
2258          * protection between themselves).
2259          */
2260
2261         list_for_each_entry_safe(obj, on,
2262                                  &dev_priv->mm.userfault_list, userfault_link)
2263                 __i915_gem_object_release_mmap(obj);
2264
2265         /* The fence will be lost when the device powers down. If any were
2266          * in use by hardware (i.e. they are pinned), we should not be powering
2267          * down! All other fences will be reacquired by the user upon waking.
2268          */
2269         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2270                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2271
2272                 /* Ideally we want to assert that the fence register is not
2273                  * live at this point (i.e. that no piece of code will be
2274                  * trying to write through fence + GTT, as that both violates
2275                  * our tracking of activity and associated locking/barriers,
2276                  * but also is illegal given that the hw is powered down).
2277                  *
2278                  * Previously we used reg->pin_count as a "liveness" indicator.
2279                  * That is not sufficient, and we need a more fine-grained
2280                  * tool if we want to have a sanity check here.
2281                  */
2282
2283                 if (!reg->vma)
2284                         continue;
2285
2286                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2287                 reg->dirty = true;
2288         }
2289 }
2290
2291 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2292 {
2293         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2294         int err;
2295
2296         err = drm_gem_create_mmap_offset(&obj->base);
2297         if (likely(!err))
2298                 return 0;
2299
2300         /* Attempt to reap some mmap space from dead objects */
2301         do {
2302                 err = i915_gem_wait_for_idle(dev_priv,
2303                                              I915_WAIT_INTERRUPTIBLE,
2304                                              MAX_SCHEDULE_TIMEOUT);
2305                 if (err)
2306                         break;
2307
2308                 i915_gem_drain_freed_objects(dev_priv);
2309                 err = drm_gem_create_mmap_offset(&obj->base);
2310                 if (!err)
2311                         break;
2312
2313         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2314
2315         return err;
2316 }
2317
2318 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2319 {
2320         drm_gem_free_mmap_offset(&obj->base);
2321 }
2322
2323 int
2324 i915_gem_mmap_gtt(struct drm_file *file,
2325                   struct drm_device *dev,
2326                   uint32_t handle,
2327                   uint64_t *offset)
2328 {
2329         struct drm_i915_gem_object *obj;
2330         int ret;
2331
2332         obj = i915_gem_object_lookup(file, handle);
2333         if (!obj)
2334                 return -ENOENT;
2335
2336         ret = i915_gem_object_create_mmap_offset(obj);
2337         if (ret == 0)
2338                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2339
2340         i915_gem_object_put(obj);
2341         return ret;
2342 }
2343
2344 /**
2345  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2346  * @dev: DRM device
2347  * @data: GTT mapping ioctl data
2348  * @file: GEM object info
2349  *
2350  * Simply returns the fake offset to userspace so it can mmap it.
2351  * The mmap call will end up in drm_gem_mmap(), which will set things
2352  * up so we can get faults in the handler above.
2353  *
2354  * The fault handler will take care of binding the object into the GTT
2355  * (since it may have been evicted to make room for something), allocating
2356  * a fence register, and mapping the appropriate aperture address into
2357  * userspace.
2358  */
2359 int
2360 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2361                         struct drm_file *file)
2362 {
2363         struct drm_i915_gem_mmap_gtt *args = data;
2364
2365         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2366 }
2367
2368 /* Immediately discard the backing storage */
2369 static void
2370 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2371 {
2372         i915_gem_object_free_mmap_offset(obj);
2373
2374         if (obj->base.filp == NULL)
2375                 return;
2376
2377         /* Our goal here is to return as much of the memory as
2378          * is possible back to the system as we are called from OOM.
2379          * To do this we must instruct the shmfs to drop all of its
2380          * backing pages, *now*.
2381          */
2382         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2383         obj->mm.madv = __I915_MADV_PURGED;
2384         obj->mm.pages = ERR_PTR(-EFAULT);
2385 }
2386
2387 /* Try to discard unwanted pages */
2388 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2389 {
2390         struct address_space *mapping;
2391
2392         lockdep_assert_held(&obj->mm.lock);
2393         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2394
2395         switch (obj->mm.madv) {
2396         case I915_MADV_DONTNEED:
2397                 i915_gem_object_truncate(obj);
2398         case __I915_MADV_PURGED:
2399                 return;
2400         }
2401
2402         if (obj->base.filp == NULL)
2403                 return;
2404
2405         mapping = obj->base.filp->f_mapping,
2406         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2407 }
2408
2409 static void
2410 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2411                               struct sg_table *pages)
2412 {
2413         struct sgt_iter sgt_iter;
2414         struct page *page;
2415
2416         __i915_gem_object_release_shmem(obj, pages, true);
2417
2418         i915_gem_gtt_finish_pages(obj, pages);
2419
2420         if (i915_gem_object_needs_bit17_swizzle(obj))
2421                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2422
2423         for_each_sgt_page(page, sgt_iter, pages) {
2424                 if (obj->mm.dirty)
2425                         set_page_dirty(page);
2426
2427                 if (obj->mm.madv == I915_MADV_WILLNEED)
2428                         mark_page_accessed(page);
2429
2430                 put_page(page);
2431         }
2432         obj->mm.dirty = false;
2433
2434         sg_free_table(pages);
2435         kfree(pages);
2436 }
2437
2438 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2439 {
2440         struct radix_tree_iter iter;
2441         void __rcu **slot;
2442
2443         rcu_read_lock();
2444         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2445                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2446         rcu_read_unlock();
2447 }
2448
2449 static struct sg_table *
2450 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2451 {
2452         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2453         struct sg_table *pages;
2454
2455         pages = fetch_and_zero(&obj->mm.pages);
2456         if (!pages)
2457                 return NULL;
2458
2459         spin_lock(&i915->mm.obj_lock);
2460         list_del(&obj->mm.link);
2461         spin_unlock(&i915->mm.obj_lock);
2462
2463         if (obj->mm.mapping) {
2464                 void *ptr;
2465
2466                 ptr = page_mask_bits(obj->mm.mapping);
2467                 if (is_vmalloc_addr(ptr))
2468                         vunmap(ptr);
2469                 else
2470                         kunmap(kmap_to_page(ptr));
2471
2472                 obj->mm.mapping = NULL;
2473         }
2474
2475         __i915_gem_object_reset_page_iter(obj);
2476         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2477
2478         return pages;
2479 }
2480
2481 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2482                                  enum i915_mm_subclass subclass)
2483 {
2484         struct sg_table *pages;
2485
2486         if (i915_gem_object_has_pinned_pages(obj))
2487                 return;
2488
2489         GEM_BUG_ON(obj->bind_count);
2490         if (!i915_gem_object_has_pages(obj))
2491                 return;
2492
2493         /* May be called by shrinker from within get_pages() (on another bo) */
2494         mutex_lock_nested(&obj->mm.lock, subclass);
2495         if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2496                 goto unlock;
2497
2498         /*
2499          * ->put_pages might need to allocate memory for the bit17 swizzle
2500          * array, hence protect them from being reaped by removing them from gtt
2501          * lists early.
2502          */
2503         pages = __i915_gem_object_unset_pages(obj);
2504         if (!IS_ERR(pages))
2505                 obj->ops->put_pages(obj, pages);
2506
2507 unlock:
2508         mutex_unlock(&obj->mm.lock);
2509 }
2510
2511 static bool i915_sg_trim(struct sg_table *orig_st)
2512 {
2513         struct sg_table new_st;
2514         struct scatterlist *sg, *new_sg;
2515         unsigned int i;
2516
2517         if (orig_st->nents == orig_st->orig_nents)
2518                 return false;
2519
2520         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2521                 return false;
2522
2523         new_sg = new_st.sgl;
2524         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2525                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2526                 /* called before being DMA mapped, no need to copy sg->dma_* */
2527                 new_sg = sg_next(new_sg);
2528         }
2529         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2530
2531         sg_free_table(orig_st);
2532
2533         *orig_st = new_st;
2534         return true;
2535 }
2536
2537 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2538 {
2539         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2540         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2541         unsigned long i;
2542         struct address_space *mapping;
2543         struct sg_table *st;
2544         struct scatterlist *sg;
2545         struct sgt_iter sgt_iter;
2546         struct page *page;
2547         unsigned long last_pfn = 0;     /* suppress gcc warning */
2548         unsigned int max_segment = i915_sg_segment_size();
2549         unsigned int sg_page_sizes;
2550         gfp_t noreclaim;
2551         int ret;
2552
2553         /* Assert that the object is not currently in any GPU domain. As it
2554          * wasn't in the GTT, there shouldn't be any way it could have been in
2555          * a GPU cache
2556          */
2557         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2558         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2559
2560         st = kmalloc(sizeof(*st), GFP_KERNEL);
2561         if (st == NULL)
2562                 return -ENOMEM;
2563
2564 rebuild_st:
2565         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2566                 kfree(st);
2567                 return -ENOMEM;
2568         }
2569
2570         /* Get the list of pages out of our struct file.  They'll be pinned
2571          * at this point until we release them.
2572          *
2573          * Fail silently without starting the shrinker
2574          */
2575         mapping = obj->base.filp->f_mapping;
2576         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2577         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2578
2579         sg = st->sgl;
2580         st->nents = 0;
2581         sg_page_sizes = 0;
2582         for (i = 0; i < page_count; i++) {
2583                 const unsigned int shrink[] = {
2584                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2585                         0,
2586                 }, *s = shrink;
2587                 gfp_t gfp = noreclaim;
2588
2589                 do {
2590                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2591                         if (likely(!IS_ERR(page)))
2592                                 break;
2593
2594                         if (!*s) {
2595                                 ret = PTR_ERR(page);
2596                                 goto err_sg;
2597                         }
2598
2599                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2600                         cond_resched();
2601
2602                         /* We've tried hard to allocate the memory by reaping
2603                          * our own buffer, now let the real VM do its job and
2604                          * go down in flames if truly OOM.
2605                          *
2606                          * However, since graphics tend to be disposable,
2607                          * defer the oom here by reporting the ENOMEM back
2608                          * to userspace.
2609                          */
2610                         if (!*s) {
2611                                 /* reclaim and warn, but no oom */
2612                                 gfp = mapping_gfp_mask(mapping);
2613
2614                                 /* Our bo are always dirty and so we require
2615                                  * kswapd to reclaim our pages (direct reclaim
2616                                  * does not effectively begin pageout of our
2617                                  * buffers on its own). However, direct reclaim
2618                                  * only waits for kswapd when under allocation
2619                                  * congestion. So as a result __GFP_RECLAIM is
2620                                  * unreliable and fails to actually reclaim our
2621                                  * dirty pages -- unless you try over and over
2622                                  * again with !__GFP_NORETRY. However, we still
2623                                  * want to fail this allocation rather than
2624                                  * trigger the out-of-memory killer and for
2625                                  * this we want __GFP_RETRY_MAYFAIL.
2626                                  */
2627                                 gfp |= __GFP_RETRY_MAYFAIL;
2628                         }
2629                 } while (1);
2630
2631                 if (!i ||
2632                     sg->length >= max_segment ||
2633                     page_to_pfn(page) != last_pfn + 1) {
2634                         if (i) {
2635                                 sg_page_sizes |= sg->length;
2636                                 sg = sg_next(sg);
2637                         }
2638                         st->nents++;
2639                         sg_set_page(sg, page, PAGE_SIZE, 0);
2640                 } else {
2641                         sg->length += PAGE_SIZE;
2642                 }
2643                 last_pfn = page_to_pfn(page);
2644
2645                 /* Check that the i965g/gm workaround works. */
2646                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2647         }
2648         if (sg) { /* loop terminated early; short sg table */
2649                 sg_page_sizes |= sg->length;
2650                 sg_mark_end(sg);
2651         }
2652
2653         /* Trim unused sg entries to avoid wasting memory. */
2654         i915_sg_trim(st);
2655
2656         ret = i915_gem_gtt_prepare_pages(obj, st);
2657         if (ret) {
2658                 /* DMA remapping failed? One possible cause is that
2659                  * it could not reserve enough large entries, asking
2660                  * for PAGE_SIZE chunks instead may be helpful.
2661                  */
2662                 if (max_segment > PAGE_SIZE) {
2663                         for_each_sgt_page(page, sgt_iter, st)
2664                                 put_page(page);
2665                         sg_free_table(st);
2666
2667                         max_segment = PAGE_SIZE;
2668                         goto rebuild_st;
2669                 } else {
2670                         dev_warn(&dev_priv->drm.pdev->dev,
2671                                  "Failed to DMA remap %lu pages\n",
2672                                  page_count);
2673                         goto err_pages;
2674                 }
2675         }
2676
2677         if (i915_gem_object_needs_bit17_swizzle(obj))
2678                 i915_gem_object_do_bit_17_swizzle(obj, st);
2679
2680         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2681
2682         return 0;
2683
2684 err_sg:
2685         sg_mark_end(sg);
2686 err_pages:
2687         for_each_sgt_page(page, sgt_iter, st)
2688                 put_page(page);
2689         sg_free_table(st);
2690         kfree(st);
2691
2692         /* shmemfs first checks if there is enough memory to allocate the page
2693          * and reports ENOSPC should there be insufficient, along with the usual
2694          * ENOMEM for a genuine allocation failure.
2695          *
2696          * We use ENOSPC in our driver to mean that we have run out of aperture
2697          * space and so want to translate the error from shmemfs back to our
2698          * usual understanding of ENOMEM.
2699          */
2700         if (ret == -ENOSPC)
2701                 ret = -ENOMEM;
2702
2703         return ret;
2704 }
2705
2706 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2707                                  struct sg_table *pages,
2708                                  unsigned int sg_page_sizes)
2709 {
2710         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2711         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2712         int i;
2713
2714         lockdep_assert_held(&obj->mm.lock);
2715
2716         obj->mm.get_page.sg_pos = pages->sgl;
2717         obj->mm.get_page.sg_idx = 0;
2718
2719         obj->mm.pages = pages;
2720
2721         if (i915_gem_object_is_tiled(obj) &&
2722             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2723                 GEM_BUG_ON(obj->mm.quirked);
2724                 __i915_gem_object_pin_pages(obj);
2725                 obj->mm.quirked = true;
2726         }
2727
2728         GEM_BUG_ON(!sg_page_sizes);
2729         obj->mm.page_sizes.phys = sg_page_sizes;
2730
2731         /*
2732          * Calculate the supported page-sizes which fit into the given
2733          * sg_page_sizes. This will give us the page-sizes which we may be able
2734          * to use opportunistically when later inserting into the GTT. For
2735          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2736          * 64K or 4K pages, although in practice this will depend on a number of
2737          * other factors.
2738          */
2739         obj->mm.page_sizes.sg = 0;
2740         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2741                 if (obj->mm.page_sizes.phys & ~0u << i)
2742                         obj->mm.page_sizes.sg |= BIT(i);
2743         }
2744         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2745
2746         spin_lock(&i915->mm.obj_lock);
2747         list_add(&obj->mm.link, &i915->mm.unbound_list);
2748         spin_unlock(&i915->mm.obj_lock);
2749 }
2750
2751 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2752 {
2753         int err;
2754
2755         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2756                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2757                 return -EFAULT;
2758         }
2759
2760         err = obj->ops->get_pages(obj);
2761         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2762
2763         return err;
2764 }
2765
2766 /* Ensure that the associated pages are gathered from the backing storage
2767  * and pinned into our object. i915_gem_object_pin_pages() may be called
2768  * multiple times before they are released by a single call to
2769  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2770  * either as a result of memory pressure (reaping pages under the shrinker)
2771  * or as the object is itself released.
2772  */
2773 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2774 {
2775         int err;
2776
2777         err = mutex_lock_interruptible(&obj->mm.lock);
2778         if (err)
2779                 return err;
2780
2781         if (unlikely(!i915_gem_object_has_pages(obj))) {
2782                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2783
2784                 err = ____i915_gem_object_get_pages(obj);
2785                 if (err)
2786                         goto unlock;
2787
2788                 smp_mb__before_atomic();
2789         }
2790         atomic_inc(&obj->mm.pages_pin_count);
2791
2792 unlock:
2793         mutex_unlock(&obj->mm.lock);
2794         return err;
2795 }
2796
2797 /* The 'mapping' part of i915_gem_object_pin_map() below */
2798 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2799                                  enum i915_map_type type)
2800 {
2801         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2802         struct sg_table *sgt = obj->mm.pages;
2803         struct sgt_iter sgt_iter;
2804         struct page *page;
2805         struct page *stack_pages[32];
2806         struct page **pages = stack_pages;
2807         unsigned long i = 0;
2808         pgprot_t pgprot;
2809         void *addr;
2810
2811         /* A single page can always be kmapped */
2812         if (n_pages == 1 && type == I915_MAP_WB)
2813                 return kmap(sg_page(sgt->sgl));
2814
2815         if (n_pages > ARRAY_SIZE(stack_pages)) {
2816                 /* Too big for stack -- allocate temporary array instead */
2817                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2818                 if (!pages)
2819                         return NULL;
2820         }
2821
2822         for_each_sgt_page(page, sgt_iter, sgt)
2823                 pages[i++] = page;
2824
2825         /* Check that we have the expected number of pages */
2826         GEM_BUG_ON(i != n_pages);
2827
2828         switch (type) {
2829         default:
2830                 MISSING_CASE(type);
2831                 /* fallthrough to use PAGE_KERNEL anyway */
2832         case I915_MAP_WB:
2833                 pgprot = PAGE_KERNEL;
2834                 break;
2835         case I915_MAP_WC:
2836                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2837                 break;
2838         }
2839         addr = vmap(pages, n_pages, 0, pgprot);
2840
2841         if (pages != stack_pages)
2842                 kvfree(pages);
2843
2844         return addr;
2845 }
2846
2847 /* get, pin, and map the pages of the object into kernel space */
2848 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2849                               enum i915_map_type type)
2850 {
2851         enum i915_map_type has_type;
2852         bool pinned;
2853         void *ptr;
2854         int ret;
2855
2856         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2857                 return ERR_PTR(-ENXIO);
2858
2859         ret = mutex_lock_interruptible(&obj->mm.lock);
2860         if (ret)
2861                 return ERR_PTR(ret);
2862
2863         pinned = !(type & I915_MAP_OVERRIDE);
2864         type &= ~I915_MAP_OVERRIDE;
2865
2866         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2867                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2868                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2869
2870                         ret = ____i915_gem_object_get_pages(obj);
2871                         if (ret)
2872                                 goto err_unlock;
2873
2874                         smp_mb__before_atomic();
2875                 }
2876                 atomic_inc(&obj->mm.pages_pin_count);
2877                 pinned = false;
2878         }
2879         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2880
2881         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2882         if (ptr && has_type != type) {
2883                 if (pinned) {
2884                         ret = -EBUSY;
2885                         goto err_unpin;
2886                 }
2887
2888                 if (is_vmalloc_addr(ptr))
2889                         vunmap(ptr);
2890                 else
2891                         kunmap(kmap_to_page(ptr));
2892
2893                 ptr = obj->mm.mapping = NULL;
2894         }
2895
2896         if (!ptr) {
2897                 ptr = i915_gem_object_map(obj, type);
2898                 if (!ptr) {
2899                         ret = -ENOMEM;
2900                         goto err_unpin;
2901                 }
2902
2903                 obj->mm.mapping = page_pack_bits(ptr, type);
2904         }
2905
2906 out_unlock:
2907         mutex_unlock(&obj->mm.lock);
2908         return ptr;
2909
2910 err_unpin:
2911         atomic_dec(&obj->mm.pages_pin_count);
2912 err_unlock:
2913         ptr = ERR_PTR(ret);
2914         goto out_unlock;
2915 }
2916
2917 static int
2918 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2919                            const struct drm_i915_gem_pwrite *arg)
2920 {
2921         struct address_space *mapping = obj->base.filp->f_mapping;
2922         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2923         u64 remain, offset;
2924         unsigned int pg;
2925
2926         /* Before we instantiate/pin the backing store for our use, we
2927          * can prepopulate the shmemfs filp efficiently using a write into
2928          * the pagecache. We avoid the penalty of instantiating all the
2929          * pages, important if the user is just writing to a few and never
2930          * uses the object on the GPU, and using a direct write into shmemfs
2931          * allows it to avoid the cost of retrieving a page (either swapin
2932          * or clearing-before-use) before it is overwritten.
2933          */
2934         if (i915_gem_object_has_pages(obj))
2935                 return -ENODEV;
2936
2937         if (obj->mm.madv != I915_MADV_WILLNEED)
2938                 return -EFAULT;
2939
2940         /* Before the pages are instantiated the object is treated as being
2941          * in the CPU domain. The pages will be clflushed as required before
2942          * use, and we can freely write into the pages directly. If userspace
2943          * races pwrite with any other operation; corruption will ensue -
2944          * that is userspace's prerogative!
2945          */
2946
2947         remain = arg->size;
2948         offset = arg->offset;
2949         pg = offset_in_page(offset);
2950
2951         do {
2952                 unsigned int len, unwritten;
2953                 struct page *page;
2954                 void *data, *vaddr;
2955                 int err;
2956
2957                 len = PAGE_SIZE - pg;
2958                 if (len > remain)
2959                         len = remain;
2960
2961                 err = pagecache_write_begin(obj->base.filp, mapping,
2962                                             offset, len, 0,
2963                                             &page, &data);
2964                 if (err < 0)
2965                         return err;
2966
2967                 vaddr = kmap(page);
2968                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2969                 kunmap(page);
2970
2971                 err = pagecache_write_end(obj->base.filp, mapping,
2972                                           offset, len, len - unwritten,
2973                                           page, data);
2974                 if (err < 0)
2975                         return err;
2976
2977                 if (unwritten)
2978                         return -EFAULT;
2979
2980                 remain -= len;
2981                 user_data += len;
2982                 offset += len;
2983                 pg = 0;
2984         } while (remain);
2985
2986         return 0;
2987 }
2988
2989 static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
2990                                         const struct i915_gem_context *ctx)
2991 {
2992         unsigned int score;
2993         unsigned long prev_hang;
2994
2995         if (i915_gem_context_is_banned(ctx))
2996                 score = I915_CLIENT_SCORE_CONTEXT_BAN;
2997         else
2998                 score = 0;
2999
3000         prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
3001         if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
3002                 score += I915_CLIENT_SCORE_HANG_FAST;
3003
3004         if (score) {
3005                 atomic_add(score, &file_priv->ban_score);
3006
3007                 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
3008                                  ctx->name, score,
3009                                  atomic_read(&file_priv->ban_score));
3010         }
3011 }
3012
3013 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
3014 {
3015         unsigned int score;
3016         bool banned, bannable;
3017
3018         atomic_inc(&ctx->guilty_count);
3019
3020         bannable = i915_gem_context_is_bannable(ctx);
3021         score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
3022         banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
3023
3024         /* Cool contexts don't accumulate client ban score */
3025         if (!bannable)
3026                 return;
3027
3028         if (banned) {
3029                 DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
3030                                  ctx->name, atomic_read(&ctx->guilty_count),
3031                                  score);
3032                 i915_gem_context_set_banned(ctx);
3033         }
3034
3035         if (!IS_ERR_OR_NULL(ctx->file_priv))
3036                 i915_gem_client_mark_guilty(ctx->file_priv, ctx);
3037 }
3038
3039 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
3040 {
3041         atomic_inc(&ctx->active_count);
3042 }
3043
3044 struct i915_request *
3045 i915_gem_find_active_request(struct intel_engine_cs *engine)
3046 {
3047         struct i915_request *request, *active = NULL;
3048         unsigned long flags;
3049
3050         /*
3051          * We are called by the error capture, reset and to dump engine
3052          * state at random points in time. In particular, note that neither is
3053          * crucially ordered with an interrupt. After a hang, the GPU is dead
3054          * and we assume that no more writes can happen (we waited long enough
3055          * for all writes that were in transaction to be flushed) - adding an
3056          * extra delay for a recent interrupt is pointless. Hence, we do
3057          * not need an engine->irq_seqno_barrier() before the seqno reads.
3058          * At all other times, we must assume the GPU is still running, but
3059          * we only care about the snapshot of this moment.
3060          */
3061         spin_lock_irqsave(&engine->timeline.lock, flags);
3062         list_for_each_entry(request, &engine->timeline.requests, link) {
3063                 if (__i915_request_completed(request, request->global_seqno))
3064                         continue;
3065
3066                 active = request;
3067                 break;
3068         }
3069         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3070
3071         return active;
3072 }
3073
3074 /*
3075  * Ensure irq handler finishes, and not run again.
3076  * Also return the active request so that we only search for it once.
3077  */
3078 struct i915_request *
3079 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
3080 {
3081         struct i915_request *request;
3082
3083         /*
3084          * During the reset sequence, we must prevent the engine from
3085          * entering RC6. As the context state is undefined until we restart
3086          * the engine, if it does enter RC6 during the reset, the state
3087          * written to the powercontext is undefined and so we may lose
3088          * GPU state upon resume, i.e. fail to restart after a reset.
3089          */
3090         intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
3091
3092         request = engine->reset.prepare(engine);
3093         if (request && request->fence.error == -EIO)
3094                 request = ERR_PTR(-EIO); /* Previous reset failed! */
3095
3096         return request;
3097 }
3098
3099 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
3100 {
3101         struct intel_engine_cs *engine;
3102         struct i915_request *request;
3103         enum intel_engine_id id;
3104         int err = 0;
3105
3106         for_each_engine(engine, dev_priv, id) {
3107                 request = i915_gem_reset_prepare_engine(engine);
3108                 if (IS_ERR(request)) {
3109                         err = PTR_ERR(request);
3110                         continue;
3111                 }
3112
3113                 engine->hangcheck.active_request = request;
3114         }
3115
3116         i915_gem_revoke_fences(dev_priv);
3117         intel_uc_sanitize(dev_priv);
3118
3119         return err;
3120 }
3121
3122 static void engine_skip_context(struct i915_request *request)
3123 {
3124         struct intel_engine_cs *engine = request->engine;
3125         struct i915_gem_context *hung_ctx = request->gem_context;
3126         struct i915_timeline *timeline = request->timeline;
3127         unsigned long flags;
3128
3129         GEM_BUG_ON(timeline == &engine->timeline);
3130
3131         spin_lock_irqsave(&engine->timeline.lock, flags);
3132         spin_lock(&timeline->lock);
3133
3134         list_for_each_entry_continue(request, &engine->timeline.requests, link)
3135                 if (request->gem_context == hung_ctx)
3136                         i915_request_skip(request, -EIO);
3137
3138         list_for_each_entry(request, &timeline->requests, link)
3139                 i915_request_skip(request, -EIO);
3140
3141         spin_unlock(&timeline->lock);
3142         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3143 }
3144
3145 /* Returns the request if it was guilty of the hang */
3146 static struct i915_request *
3147 i915_gem_reset_request(struct intel_engine_cs *engine,
3148                        struct i915_request *request,
3149                        bool stalled)
3150 {
3151         /* The guilty request will get skipped on a hung engine.
3152          *
3153          * Users of client default contexts do not rely on logical
3154          * state preserved between batches so it is safe to execute
3155          * queued requests following the hang. Non default contexts
3156          * rely on preserved state, so skipping a batch loses the
3157          * evolution of the state and it needs to be considered corrupted.
3158          * Executing more queued batches on top of corrupted state is
3159          * risky. But we take the risk by trying to advance through
3160          * the queued requests in order to make the client behaviour
3161          * more predictable around resets, by not throwing away random
3162          * amount of batches it has prepared for execution. Sophisticated
3163          * clients can use gem_reset_stats_ioctl and dma fence status
3164          * (exported via sync_file info ioctl on explicit fences) to observe
3165          * when it loses the context state and should rebuild accordingly.
3166          *
3167          * The context ban, and ultimately the client ban, mechanism are safety
3168          * valves if client submission ends up resulting in nothing more than
3169          * subsequent hangs.
3170          */
3171
3172         if (i915_request_completed(request)) {
3173                 GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
3174                           engine->name, request->global_seqno,
3175                           request->fence.context, request->fence.seqno,
3176                           intel_engine_get_seqno(engine));
3177                 stalled = false;
3178         }
3179
3180         if (stalled) {
3181                 i915_gem_context_mark_guilty(request->gem_context);
3182                 i915_request_skip(request, -EIO);
3183
3184                 /* If this context is now banned, skip all pending requests. */
3185                 if (i915_gem_context_is_banned(request->gem_context))
3186                         engine_skip_context(request);
3187         } else {
3188                 /*
3189                  * Since this is not the hung engine, it may have advanced
3190                  * since the hang declaration. Double check by refinding
3191                  * the active request at the time of the reset.
3192                  */
3193                 request = i915_gem_find_active_request(engine);
3194                 if (request) {
3195                         unsigned long flags;
3196
3197                         i915_gem_context_mark_innocent(request->gem_context);
3198                         dma_fence_set_error(&request->fence, -EAGAIN);
3199
3200                         /* Rewind the engine to replay the incomplete rq */
3201                         spin_lock_irqsave(&engine->timeline.lock, flags);
3202                         request = list_prev_entry(request, link);
3203                         if (&request->link == &engine->timeline.requests)
3204                                 request = NULL;
3205                         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3206                 }
3207         }
3208
3209         return request;
3210 }
3211
3212 void i915_gem_reset_engine(struct intel_engine_cs *engine,
3213                            struct i915_request *request,
3214                            bool stalled)
3215 {
3216         /*
3217          * Make sure this write is visible before we re-enable the interrupt
3218          * handlers on another CPU, as tasklet_enable() resolves to just
3219          * a compiler barrier which is insufficient for our purpose here.
3220          */
3221         smp_store_mb(engine->irq_posted, 0);
3222
3223         if (request)
3224                 request = i915_gem_reset_request(engine, request, stalled);
3225
3226         /* Setup the CS to resume from the breadcrumb of the hung request */
3227         engine->reset.reset(engine, request);
3228 }
3229
3230 void i915_gem_reset(struct drm_i915_private *dev_priv,
3231                     unsigned int stalled_mask)
3232 {
3233         struct intel_engine_cs *engine;
3234         enum intel_engine_id id;
3235
3236         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3237
3238         i915_retire_requests(dev_priv);
3239
3240         for_each_engine(engine, dev_priv, id) {
3241                 struct intel_context *ce;
3242
3243                 i915_gem_reset_engine(engine,
3244                                       engine->hangcheck.active_request,
3245                                       stalled_mask & ENGINE_MASK(id));
3246                 ce = fetch_and_zero(&engine->last_retired_context);
3247                 if (ce)
3248                         intel_context_unpin(ce);
3249
3250                 /*
3251                  * Ostensibily, we always want a context loaded for powersaving,
3252                  * so if the engine is idle after the reset, send a request
3253                  * to load our scratch kernel_context.
3254                  *
3255                  * More mysteriously, if we leave the engine idle after a reset,
3256                  * the next userspace batch may hang, with what appears to be
3257                  * an incoherent read by the CS (presumably stale TLB). An
3258                  * empty request appears sufficient to paper over the glitch.
3259                  */
3260                 if (intel_engine_is_idle(engine)) {
3261                         struct i915_request *rq;
3262
3263                         rq = i915_request_alloc(engine,
3264                                                 dev_priv->kernel_context);
3265                         if (!IS_ERR(rq))
3266                                 i915_request_add(rq);
3267                 }
3268         }
3269
3270         i915_gem_restore_fences(dev_priv);
3271 }
3272
3273 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3274 {
3275         engine->reset.finish(engine);
3276
3277         intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3278 }
3279
3280 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3281 {
3282         struct intel_engine_cs *engine;
3283         enum intel_engine_id id;
3284
3285         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3286
3287         for_each_engine(engine, dev_priv, id) {
3288                 engine->hangcheck.active_request = NULL;
3289                 i915_gem_reset_finish_engine(engine);
3290         }
3291 }
3292
3293 static void nop_submit_request(struct i915_request *request)
3294 {
3295         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3296                   request->engine->name,
3297                   request->fence.context, request->fence.seqno);
3298         dma_fence_set_error(&request->fence, -EIO);
3299
3300         i915_request_submit(request);
3301 }
3302
3303 static void nop_complete_submit_request(struct i915_request *request)
3304 {
3305         unsigned long flags;
3306
3307         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3308                   request->engine->name,
3309                   request->fence.context, request->fence.seqno);
3310         dma_fence_set_error(&request->fence, -EIO);
3311
3312         spin_lock_irqsave(&request->engine->timeline.lock, flags);
3313         __i915_request_submit(request);
3314         intel_engine_init_global_seqno(request->engine, request->global_seqno);
3315         spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
3316 }
3317
3318 void i915_gem_set_wedged(struct drm_i915_private *i915)
3319 {
3320         struct intel_engine_cs *engine;
3321         enum intel_engine_id id;
3322
3323         GEM_TRACE("start\n");
3324
3325         if (GEM_SHOW_DEBUG()) {
3326                 struct drm_printer p = drm_debug_printer(__func__);
3327
3328                 for_each_engine(engine, i915, id)
3329                         intel_engine_dump(engine, &p, "%s\n", engine->name);
3330         }
3331
3332         set_bit(I915_WEDGED, &i915->gpu_error.flags);
3333         smp_mb__after_atomic();
3334
3335         /*
3336          * First, stop submission to hw, but do not yet complete requests by
3337          * rolling the global seqno forward (since this would complete requests
3338          * for which we haven't set the fence error to EIO yet).
3339          */
3340         for_each_engine(engine, i915, id) {
3341                 i915_gem_reset_prepare_engine(engine);
3342
3343                 engine->submit_request = nop_submit_request;
3344                 engine->schedule = NULL;
3345         }
3346         i915->caps.scheduler = 0;
3347
3348         /* Even if the GPU reset fails, it should still stop the engines */
3349         intel_gpu_reset(i915, ALL_ENGINES);
3350
3351         /*
3352          * Make sure no one is running the old callback before we proceed with
3353          * cancelling requests and resetting the completion tracking. Otherwise
3354          * we might submit a request to the hardware which never completes.
3355          */
3356         synchronize_rcu();
3357
3358         for_each_engine(engine, i915, id) {
3359                 /* Mark all executing requests as skipped */
3360                 engine->cancel_requests(engine);
3361
3362                 /*
3363                  * Only once we've force-cancelled all in-flight requests can we
3364                  * start to complete all requests.
3365                  */
3366                 engine->submit_request = nop_complete_submit_request;
3367         }
3368
3369         /*
3370          * Make sure no request can slip through without getting completed by
3371          * either this call here to intel_engine_init_global_seqno, or the one
3372          * in nop_complete_submit_request.
3373          */
3374         synchronize_rcu();
3375
3376         for_each_engine(engine, i915, id) {
3377                 unsigned long flags;
3378
3379                 /*
3380                  * Mark all pending requests as complete so that any concurrent
3381                  * (lockless) lookup doesn't try and wait upon the request as we
3382                  * reset it.
3383                  */
3384                 spin_lock_irqsave(&engine->timeline.lock, flags);
3385                 intel_engine_init_global_seqno(engine,
3386                                                intel_engine_last_submit(engine));
3387                 spin_unlock_irqrestore(&engine->timeline.lock, flags);
3388
3389                 i915_gem_reset_finish_engine(engine);
3390         }
3391
3392         GEM_TRACE("end\n");
3393
3394         wake_up_all(&i915->gpu_error.reset_queue);
3395 }
3396
3397 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3398 {
3399         struct i915_timeline *tl;
3400
3401         lockdep_assert_held(&i915->drm.struct_mutex);
3402         if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3403                 return true;
3404
3405         GEM_TRACE("start\n");
3406
3407         /*
3408          * Before unwedging, make sure that all pending operations
3409          * are flushed and errored out - we may have requests waiting upon
3410          * third party fences. We marked all inflight requests as EIO, and
3411          * every execbuf since returned EIO, for consistency we want all
3412          * the currently pending requests to also be marked as EIO, which
3413          * is done inside our nop_submit_request - and so we must wait.
3414          *
3415          * No more can be submitted until we reset the wedged bit.
3416          */
3417         list_for_each_entry(tl, &i915->gt.timelines, link) {
3418                 struct i915_request *rq;
3419
3420                 rq = i915_gem_active_peek(&tl->last_request,
3421                                           &i915->drm.struct_mutex);
3422                 if (!rq)
3423                         continue;
3424
3425                 /*
3426                  * We can't use our normal waiter as we want to
3427                  * avoid recursively trying to handle the current
3428                  * reset. The basic dma_fence_default_wait() installs
3429                  * a callback for dma_fence_signal(), which is
3430                  * triggered by our nop handler (indirectly, the
3431                  * callback enables the signaler thread which is
3432                  * woken by the nop_submit_request() advancing the seqno
3433                  * and when the seqno passes the fence, the signaler
3434                  * then signals the fence waking us up).
3435                  */
3436                 if (dma_fence_default_wait(&rq->fence, true,
3437                                            MAX_SCHEDULE_TIMEOUT) < 0)
3438                         return false;
3439         }
3440         i915_retire_requests(i915);
3441         GEM_BUG_ON(i915->gt.active_requests);
3442
3443         /*
3444          * Undo nop_submit_request. We prevent all new i915 requests from
3445          * being queued (by disallowing execbuf whilst wedged) so having
3446          * waited for all active requests above, we know the system is idle
3447          * and do not have to worry about a thread being inside
3448          * engine->submit_request() as we swap over. So unlike installing
3449          * the nop_submit_request on reset, we can do this from normal
3450          * context and do not require stop_machine().
3451          */
3452         intel_engines_reset_default_submission(i915);
3453         i915_gem_contexts_lost(i915);
3454
3455         GEM_TRACE("end\n");
3456
3457         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3458         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3459
3460         return true;
3461 }
3462
3463 static void
3464 i915_gem_retire_work_handler(struct work_struct *work)
3465 {
3466         struct drm_i915_private *dev_priv =
3467                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3468         struct drm_device *dev = &dev_priv->drm;
3469
3470         /* Come back later if the device is busy... */
3471         if (mutex_trylock(&dev->struct_mutex)) {
3472                 i915_retire_requests(dev_priv);
3473                 mutex_unlock(&dev->struct_mutex);
3474         }
3475
3476         /*
3477          * Keep the retire handler running until we are finally idle.
3478          * We do not need to do this test under locking as in the worst-case
3479          * we queue the retire worker once too often.
3480          */
3481         if (READ_ONCE(dev_priv->gt.awake))
3482                 queue_delayed_work(dev_priv->wq,
3483                                    &dev_priv->gt.retire_work,
3484                                    round_jiffies_up_relative(HZ));
3485 }
3486
3487 static void shrink_caches(struct drm_i915_private *i915)
3488 {
3489         /*
3490          * kmem_cache_shrink() discards empty slabs and reorders partially
3491          * filled slabs to prioritise allocating from the mostly full slabs,
3492          * with the aim of reducing fragmentation.
3493          */
3494         kmem_cache_shrink(i915->priorities);
3495         kmem_cache_shrink(i915->dependencies);
3496         kmem_cache_shrink(i915->requests);
3497         kmem_cache_shrink(i915->luts);
3498         kmem_cache_shrink(i915->vmas);
3499         kmem_cache_shrink(i915->objects);
3500 }
3501
3502 struct sleep_rcu_work {
3503         union {
3504                 struct rcu_head rcu;
3505                 struct work_struct work;
3506         };
3507         struct drm_i915_private *i915;
3508         unsigned int epoch;
3509 };
3510
3511 static inline bool
3512 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3513 {
3514         /*
3515          * There is a small chance that the epoch wrapped since we started
3516          * sleeping. If we assume that epoch is at least a u32, then it will
3517          * take at least 2^32 * 100ms for it to wrap, or about 326 years.
3518          */
3519         return epoch == READ_ONCE(i915->gt.epoch);
3520 }
3521
3522 static void __sleep_work(struct work_struct *work)
3523 {
3524         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3525         struct drm_i915_private *i915 = s->i915;
3526         unsigned int epoch = s->epoch;
3527
3528         kfree(s);
3529         if (same_epoch(i915, epoch))
3530                 shrink_caches(i915);
3531 }
3532
3533 static void __sleep_rcu(struct rcu_head *rcu)
3534 {
3535         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3536         struct drm_i915_private *i915 = s->i915;
3537
3538         if (same_epoch(i915, s->epoch)) {
3539                 INIT_WORK(&s->work, __sleep_work);
3540                 queue_work(i915->wq, &s->work);
3541         } else {
3542                 kfree(s);
3543         }
3544 }
3545
3546 static inline bool
3547 new_requests_since_last_retire(const struct drm_i915_private *i915)
3548 {
3549         return (READ_ONCE(i915->gt.active_requests) ||
3550                 work_pending(&i915->gt.idle_work.work));
3551 }
3552
3553 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
3554 {
3555         struct intel_engine_cs *engine;
3556         enum intel_engine_id id;
3557
3558         if (i915_terminally_wedged(&i915->gpu_error))
3559                 return;
3560
3561         GEM_BUG_ON(i915->gt.active_requests);
3562         for_each_engine(engine, i915, id) {
3563                 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
3564                 GEM_BUG_ON(engine->last_retired_context !=
3565                            to_intel_context(i915->kernel_context, engine));
3566         }
3567 }
3568
3569 static void
3570 i915_gem_idle_work_handler(struct work_struct *work)
3571 {
3572         struct drm_i915_private *dev_priv =
3573                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3574         unsigned int epoch = I915_EPOCH_INVALID;
3575         bool rearm_hangcheck;
3576
3577         if (!READ_ONCE(dev_priv->gt.awake))
3578                 return;
3579
3580         if (READ_ONCE(dev_priv->gt.active_requests))
3581                 return;
3582
3583         /*
3584          * Flush out the last user context, leaving only the pinned
3585          * kernel context resident. When we are idling on the kernel_context,
3586          * no more new requests (with a context switch) are emitted and we
3587          * can finally rest. A consequence is that the idle work handler is
3588          * always called at least twice before idling (and if the system is
3589          * idle that implies a round trip through the retire worker).
3590          */
3591         mutex_lock(&dev_priv->drm.struct_mutex);
3592         i915_gem_switch_to_kernel_context(dev_priv);
3593         mutex_unlock(&dev_priv->drm.struct_mutex);
3594
3595         GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
3596                   READ_ONCE(dev_priv->gt.active_requests));
3597
3598         /*
3599          * Wait for last execlists context complete, but bail out in case a
3600          * new request is submitted. As we don't trust the hardware, we
3601          * continue on if the wait times out. This is necessary to allow
3602          * the machine to suspend even if the hardware dies, and we will
3603          * try to recover in resume (after depriving the hardware of power,
3604          * it may be in a better mmod).
3605          */
3606         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3607                    intel_engines_are_idle(dev_priv),
3608                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3609                    10, 500);
3610
3611         rearm_hangcheck =
3612                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3613
3614         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3615                 /* Currently busy, come back later */
3616                 mod_delayed_work(dev_priv->wq,
3617                                  &dev_priv->gt.idle_work,
3618                                  msecs_to_jiffies(50));
3619                 goto out_rearm;
3620         }
3621
3622         /*
3623          * New request retired after this work handler started, extend active
3624          * period until next instance of the work.
3625          */
3626         if (new_requests_since_last_retire(dev_priv))
3627                 goto out_unlock;
3628
3629         epoch = __i915_gem_park(dev_priv);
3630
3631         assert_kernel_context_is_current(dev_priv);
3632
3633         rearm_hangcheck = false;
3634 out_unlock:
3635         mutex_unlock(&dev_priv->drm.struct_mutex);
3636
3637 out_rearm:
3638         if (rearm_hangcheck) {
3639                 GEM_BUG_ON(!dev_priv->gt.awake);
3640                 i915_queue_hangcheck(dev_priv);
3641         }
3642
3643         /*
3644          * When we are idle, it is an opportune time to reap our caches.
3645          * However, we have many objects that utilise RCU and the ordered
3646          * i915->wq that this work is executing on. To try and flush any
3647          * pending frees now we are idle, we first wait for an RCU grace
3648          * period, and then queue a task (that will run last on the wq) to
3649          * shrink and re-optimize the caches.
3650          */
3651         if (same_epoch(dev_priv, epoch)) {
3652                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3653                 if (s) {
3654                         s->i915 = dev_priv;
3655                         s->epoch = epoch;
3656                         call_rcu(&s->rcu, __sleep_rcu);
3657                 }
3658         }
3659 }
3660
3661 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3662 {
3663         struct drm_i915_private *i915 = to_i915(gem->dev);
3664         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3665         struct drm_i915_file_private *fpriv = file->driver_priv;
3666         struct i915_lut_handle *lut, *ln;
3667
3668         mutex_lock(&i915->drm.struct_mutex);
3669
3670         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3671                 struct i915_gem_context *ctx = lut->ctx;
3672                 struct i915_vma *vma;
3673
3674                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3675                 if (ctx->file_priv != fpriv)
3676                         continue;
3677
3678                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3679                 GEM_BUG_ON(vma->obj != obj);
3680
3681                 /* We allow the process to have multiple handles to the same
3682                  * vma, in the same fd namespace, by virtue of flink/open.
3683                  */
3684                 GEM_BUG_ON(!vma->open_count);
3685                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3686                         i915_vma_close(vma);
3687
3688                 list_del(&lut->obj_link);
3689                 list_del(&lut->ctx_link);
3690
3691                 kmem_cache_free(i915->luts, lut);
3692                 __i915_gem_object_release_unless_active(obj);
3693         }
3694
3695         mutex_unlock(&i915->drm.struct_mutex);
3696 }
3697
3698 static unsigned long to_wait_timeout(s64 timeout_ns)
3699 {
3700         if (timeout_ns < 0)
3701                 return MAX_SCHEDULE_TIMEOUT;
3702
3703         if (timeout_ns == 0)
3704                 return 0;
3705
3706         return nsecs_to_jiffies_timeout(timeout_ns);
3707 }
3708
3709 /**
3710  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3711  * @dev: drm device pointer
3712  * @data: ioctl data blob
3713  * @file: drm file pointer
3714  *
3715  * Returns 0 if successful, else an error is returned with the remaining time in
3716  * the timeout parameter.
3717  *  -ETIME: object is still busy after timeout
3718  *  -ERESTARTSYS: signal interrupted the wait
3719  *  -ENONENT: object doesn't exist
3720  * Also possible, but rare:
3721  *  -EAGAIN: incomplete, restart syscall
3722  *  -ENOMEM: damn
3723  *  -ENODEV: Internal IRQ fail
3724  *  -E?: The add request failed
3725  *
3726  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3727  * non-zero timeout parameter the wait ioctl will wait for the given number of
3728  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3729  * without holding struct_mutex the object may become re-busied before this
3730  * function completes. A similar but shorter * race condition exists in the busy
3731  * ioctl
3732  */
3733 int
3734 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3735 {
3736         struct drm_i915_gem_wait *args = data;
3737         struct drm_i915_gem_object *obj;
3738         ktime_t start;
3739         long ret;
3740
3741         if (args->flags != 0)
3742                 return -EINVAL;
3743
3744         obj = i915_gem_object_lookup(file, args->bo_handle);
3745         if (!obj)
3746                 return -ENOENT;
3747
3748         start = ktime_get();
3749
3750         ret = i915_gem_object_wait(obj,
3751                                    I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
3752                                    to_wait_timeout(args->timeout_ns),
3753                                    to_rps_client(file));
3754
3755         if (args->timeout_ns > 0) {
3756                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3757                 if (args->timeout_ns < 0)
3758                         args->timeout_ns = 0;
3759
3760                 /*
3761                  * Apparently ktime isn't accurate enough and occasionally has a
3762                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3763                  * things up to make the test happy. We allow up to 1 jiffy.
3764                  *
3765                  * This is a regression from the timespec->ktime conversion.
3766                  */
3767                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3768                         args->timeout_ns = 0;
3769
3770                 /* Asked to wait beyond the jiffie/scheduler precision? */
3771                 if (ret == -ETIME && args->timeout_ns)
3772                         ret = -EAGAIN;
3773         }
3774
3775         i915_gem_object_put(obj);
3776         return ret;
3777 }
3778
3779 static long wait_for_timeline(struct i915_timeline *tl,
3780                               unsigned int flags, long timeout)
3781 {
3782         struct i915_request *rq;
3783
3784         rq = i915_gem_active_get_unlocked(&tl->last_request);
3785         if (!rq)
3786                 return timeout;
3787
3788         /*
3789          * "Race-to-idle".
3790          *
3791          * Switching to the kernel context is often used a synchronous
3792          * step prior to idling, e.g. in suspend for flushing all
3793          * current operations to memory before sleeping. These we
3794          * want to complete as quickly as possible to avoid prolonged
3795          * stalls, so allow the gpu to boost to maximum clocks.
3796          */
3797         if (flags & I915_WAIT_FOR_IDLE_BOOST)
3798                 gen6_rps_boost(rq, NULL);
3799
3800         timeout = i915_request_wait(rq, flags, timeout);
3801         i915_request_put(rq);
3802
3803         return timeout;
3804 }
3805
3806 static int wait_for_engines(struct drm_i915_private *i915)
3807 {
3808         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3809                 dev_err(i915->drm.dev,
3810                         "Failed to idle engines, declaring wedged!\n");
3811                 GEM_TRACE_DUMP();
3812                 i915_gem_set_wedged(i915);
3813                 return -EIO;
3814         }
3815
3816         return 0;
3817 }
3818
3819 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3820                            unsigned int flags, long timeout)
3821 {
3822         GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3823                   flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3824                   timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3825
3826         /* If the device is asleep, we have no requests outstanding */
3827         if (!READ_ONCE(i915->gt.awake))
3828                 return 0;
3829
3830         if (flags & I915_WAIT_LOCKED) {
3831                 struct i915_timeline *tl;
3832                 int err;
3833
3834                 lockdep_assert_held(&i915->drm.struct_mutex);
3835
3836                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3837                         timeout = wait_for_timeline(tl, flags, timeout);
3838                         if (timeout < 0)
3839                                 return timeout;
3840                 }
3841
3842                 err = wait_for_engines(i915);
3843                 if (err)
3844                         return err;
3845
3846                 i915_retire_requests(i915);
3847                 GEM_BUG_ON(i915->gt.active_requests);
3848         } else {
3849                 struct intel_engine_cs *engine;
3850                 enum intel_engine_id id;
3851
3852                 for_each_engine(engine, i915, id) {
3853                         struct i915_timeline *tl = &engine->timeline;
3854
3855                         timeout = wait_for_timeline(tl, flags, timeout);
3856                         if (timeout < 0)
3857                                 return timeout;
3858                 }
3859         }
3860
3861         return 0;
3862 }
3863
3864 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3865 {
3866         /*
3867          * We manually flush the CPU domain so that we can override and
3868          * force the flush for the display, and perform it asyncrhonously.
3869          */
3870         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3871         if (obj->cache_dirty)
3872                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3873         obj->write_domain = 0;
3874 }
3875
3876 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3877 {
3878         if (!READ_ONCE(obj->pin_global))
3879                 return;
3880
3881         mutex_lock(&obj->base.dev->struct_mutex);
3882         __i915_gem_object_flush_for_display(obj);
3883         mutex_unlock(&obj->base.dev->struct_mutex);
3884 }
3885
3886 /**
3887  * Moves a single object to the WC read, and possibly write domain.
3888  * @obj: object to act on
3889  * @write: ask for write access or read only
3890  *
3891  * This function returns when the move is complete, including waiting on
3892  * flushes to occur.
3893  */
3894 int
3895 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3896 {
3897         int ret;
3898
3899         lockdep_assert_held(&obj->base.dev->struct_mutex);
3900
3901         ret = i915_gem_object_wait(obj,
3902                                    I915_WAIT_INTERRUPTIBLE |
3903                                    I915_WAIT_LOCKED |
3904                                    (write ? I915_WAIT_ALL : 0),
3905                                    MAX_SCHEDULE_TIMEOUT,
3906                                    NULL);
3907         if (ret)
3908                 return ret;
3909
3910         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3911                 return 0;
3912
3913         /* Flush and acquire obj->pages so that we are coherent through
3914          * direct access in memory with previous cached writes through
3915          * shmemfs and that our cache domain tracking remains valid.
3916          * For example, if the obj->filp was moved to swap without us
3917          * being notified and releasing the pages, we would mistakenly
3918          * continue to assume that the obj remained out of the CPU cached
3919          * domain.
3920          */
3921         ret = i915_gem_object_pin_pages(obj);
3922         if (ret)
3923                 return ret;
3924
3925         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3926
3927         /* Serialise direct access to this object with the barriers for
3928          * coherent writes from the GPU, by effectively invalidating the
3929          * WC domain upon first access.
3930          */
3931         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3932                 mb();
3933
3934         /* It should now be out of any other write domains, and we can update
3935          * the domain values for our changes.
3936          */
3937         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3938         obj->read_domains |= I915_GEM_DOMAIN_WC;
3939         if (write) {
3940                 obj->read_domains = I915_GEM_DOMAIN_WC;
3941                 obj->write_domain = I915_GEM_DOMAIN_WC;
3942                 obj->mm.dirty = true;
3943         }
3944
3945         i915_gem_object_unpin_pages(obj);
3946         return 0;
3947 }
3948
3949 /**
3950  * Moves a single object to the GTT read, and possibly write domain.
3951  * @obj: object to act on
3952  * @write: ask for write access or read only
3953  *
3954  * This function returns when the move is complete, including waiting on
3955  * flushes to occur.
3956  */
3957 int
3958 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3959 {
3960         int ret;
3961
3962         lockdep_assert_held(&obj->base.dev->struct_mutex);
3963
3964         ret = i915_gem_object_wait(obj,
3965                                    I915_WAIT_INTERRUPTIBLE |
3966                                    I915_WAIT_LOCKED |
3967                                    (write ? I915_WAIT_ALL : 0),
3968                                    MAX_SCHEDULE_TIMEOUT,
3969                                    NULL);
3970         if (ret)
3971                 return ret;
3972
3973         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3974                 return 0;
3975
3976         /* Flush and acquire obj->pages so that we are coherent through
3977          * direct access in memory with previous cached writes through
3978          * shmemfs and that our cache domain tracking remains valid.
3979          * For example, if the obj->filp was moved to swap without us
3980          * being notified and releasing the pages, we would mistakenly
3981          * continue to assume that the obj remained out of the CPU cached
3982          * domain.
3983          */
3984         ret = i915_gem_object_pin_pages(obj);
3985         if (ret)
3986                 return ret;
3987
3988         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3989
3990         /* Serialise direct access to this object with the barriers for
3991          * coherent writes from the GPU, by effectively invalidating the
3992          * GTT domain upon first access.
3993          */
3994         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3995                 mb();
3996
3997         /* It should now be out of any other write domains, and we can update
3998          * the domain values for our changes.
3999          */
4000         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
4001         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4002         if (write) {
4003                 obj->read_domains = I915_GEM_DOMAIN_GTT;
4004                 obj->write_domain = I915_GEM_DOMAIN_GTT;
4005                 obj->mm.dirty = true;
4006         }
4007
4008         i915_gem_object_unpin_pages(obj);
4009         return 0;
4010 }
4011
4012 /**
4013  * Changes the cache-level of an object across all VMA.
4014  * @obj: object to act on
4015  * @cache_level: new cache level to set for the object
4016  *
4017  * After this function returns, the object will be in the new cache-level
4018  * across all GTT and the contents of the backing storage will be coherent,
4019  * with respect to the new cache-level. In order to keep the backing storage
4020  * coherent for all users, we only allow a single cache level to be set
4021  * globally on the object and prevent it from being changed whilst the
4022  * hardware is reading from the object. That is if the object is currently
4023  * on the scanout it will be set to uncached (or equivalent display
4024  * cache coherency) and all non-MOCS GPU access will also be uncached so
4025  * that all direct access to the scanout remains coherent.
4026  */
4027 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
4028                                     enum i915_cache_level cache_level)
4029 {
4030         struct i915_vma *vma;
4031         int ret;
4032
4033         lockdep_assert_held(&obj->base.dev->struct_mutex);
4034
4035         if (obj->cache_level == cache_level)
4036                 return 0;
4037
4038         /* Inspect the list of currently bound VMA and unbind any that would
4039          * be invalid given the new cache-level. This is principally to
4040          * catch the issue of the CS prefetch crossing page boundaries and
4041          * reading an invalid PTE on older architectures.
4042          */
4043 restart:
4044         list_for_each_entry(vma, &obj->vma_list, obj_link) {
4045                 if (!drm_mm_node_allocated(&vma->node))
4046                         continue;
4047
4048                 if (i915_vma_is_pinned(vma)) {
4049                         DRM_DEBUG("can not change the cache level of pinned objects\n");
4050                         return -EBUSY;
4051                 }
4052
4053                 if (!i915_vma_is_closed(vma) &&
4054                     i915_gem_valid_gtt_space(vma, cache_level))
4055                         continue;
4056
4057                 ret = i915_vma_unbind(vma);
4058                 if (ret)
4059                         return ret;
4060
4061                 /* As unbinding may affect other elements in the
4062                  * obj->vma_list (due to side-effects from retiring
4063                  * an active vma), play safe and restart the iterator.
4064                  */
4065                 goto restart;
4066         }
4067
4068         /* We can reuse the existing drm_mm nodes but need to change the
4069          * cache-level on the PTE. We could simply unbind them all and
4070          * rebind with the correct cache-level on next use. However since
4071          * we already have a valid slot, dma mapping, pages etc, we may as
4072          * rewrite the PTE in the belief that doing so tramples upon less
4073          * state and so involves less work.
4074          */
4075         if (obj->bind_count) {
4076                 /* Before we change the PTE, the GPU must not be accessing it.
4077                  * If we wait upon the object, we know that all the bound
4078                  * VMA are no longer active.
4079                  */
4080                 ret = i915_gem_object_wait(obj,
4081                                            I915_WAIT_INTERRUPTIBLE |
4082                                            I915_WAIT_LOCKED |
4083                                            I915_WAIT_ALL,
4084                                            MAX_SCHEDULE_TIMEOUT,
4085                                            NULL);
4086                 if (ret)
4087                         return ret;
4088
4089                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
4090                     cache_level != I915_CACHE_NONE) {
4091                         /* Access to snoopable pages through the GTT is
4092                          * incoherent and on some machines causes a hard
4093                          * lockup. Relinquish the CPU mmaping to force
4094                          * userspace to refault in the pages and we can
4095                          * then double check if the GTT mapping is still
4096                          * valid for that pointer access.
4097                          */
4098                         i915_gem_release_mmap(obj);
4099
4100                         /* As we no longer need a fence for GTT access,
4101                          * we can relinquish it now (and so prevent having
4102                          * to steal a fence from someone else on the next
4103                          * fence request). Note GPU activity would have
4104                          * dropped the fence as all snoopable access is
4105                          * supposed to be linear.
4106                          */
4107                         for_each_ggtt_vma(vma, obj) {
4108                                 ret = i915_vma_put_fence(vma);
4109                                 if (ret)
4110                                         return ret;
4111                         }
4112                 } else {
4113                         /* We either have incoherent backing store and
4114                          * so no GTT access or the architecture is fully
4115                          * coherent. In such cases, existing GTT mmaps
4116                          * ignore the cache bit in the PTE and we can
4117                          * rewrite it without confusing the GPU or having
4118                          * to force userspace to fault back in its mmaps.
4119                          */
4120                 }
4121
4122                 list_for_each_entry(vma, &obj->vma_list, obj_link) {
4123                         if (!drm_mm_node_allocated(&vma->node))
4124                                 continue;
4125
4126                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
4127                         if (ret)
4128                                 return ret;
4129                 }
4130         }
4131
4132         list_for_each_entry(vma, &obj->vma_list, obj_link)
4133                 vma->node.color = cache_level;
4134         i915_gem_object_set_cache_coherency(obj, cache_level);
4135         obj->cache_dirty = true; /* Always invalidate stale cachelines */
4136
4137         return 0;
4138 }
4139
4140 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
4141                                struct drm_file *file)
4142 {
4143         struct drm_i915_gem_caching *args = data;
4144         struct drm_i915_gem_object *obj;
4145         int err = 0;
4146
4147         rcu_read_lock();
4148         obj = i915_gem_object_lookup_rcu(file, args->handle);
4149         if (!obj) {
4150                 err = -ENOENT;
4151                 goto out;
4152         }
4153
4154         switch (obj->cache_level) {
4155         case I915_CACHE_LLC:
4156         case I915_CACHE_L3_LLC:
4157                 args->caching = I915_CACHING_CACHED;
4158                 break;
4159
4160         case I915_CACHE_WT:
4161                 args->caching = I915_CACHING_DISPLAY;
4162                 break;
4163
4164         default:
4165                 args->caching = I915_CACHING_NONE;
4166                 break;
4167         }
4168 out:
4169         rcu_read_unlock();
4170         return err;
4171 }
4172
4173 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4174                                struct drm_file *file)
4175 {
4176         struct drm_i915_private *i915 = to_i915(dev);
4177         struct drm_i915_gem_caching *args = data;
4178         struct drm_i915_gem_object *obj;
4179         enum i915_cache_level level;
4180         int ret = 0;
4181
4182         switch (args->caching) {
4183         case I915_CACHING_NONE:
4184                 level = I915_CACHE_NONE;
4185                 break;
4186         case I915_CACHING_CACHED:
4187                 /*
4188                  * Due to a HW issue on BXT A stepping, GPU stores via a
4189                  * snooped mapping may leave stale data in a corresponding CPU
4190                  * cacheline, whereas normally such cachelines would get
4191                  * invalidated.
4192                  */
4193                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4194                         return -ENODEV;
4195
4196                 level = I915_CACHE_LLC;
4197                 break;
4198         case I915_CACHING_DISPLAY:
4199                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4200                 break;
4201         default:
4202                 return -EINVAL;
4203         }
4204
4205         obj = i915_gem_object_lookup(file, args->handle);
4206         if (!obj)
4207                 return -ENOENT;
4208
4209         /*
4210          * The caching mode of proxy object is handled by its generator, and
4211          * not allowed to be changed by userspace.
4212          */
4213         if (i915_gem_object_is_proxy(obj)) {
4214                 ret = -ENXIO;
4215                 goto out;
4216         }
4217
4218         if (obj->cache_level == level)
4219                 goto out;
4220
4221         ret = i915_gem_object_wait(obj,
4222                                    I915_WAIT_INTERRUPTIBLE,
4223                                    MAX_SCHEDULE_TIMEOUT,
4224                                    to_rps_client(file));
4225         if (ret)
4226                 goto out;
4227
4228         ret = i915_mutex_lock_interruptible(dev);
4229         if (ret)
4230                 goto out;
4231
4232         ret = i915_gem_object_set_cache_level(obj, level);
4233         mutex_unlock(&dev->struct_mutex);
4234
4235 out:
4236         i915_gem_object_put(obj);
4237         return ret;
4238 }
4239
4240 /*
4241  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
4242  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
4243  * (for pageflips). We only flush the caches while preparing the buffer for
4244  * display, the callers are responsible for frontbuffer flush.
4245  */
4246 struct i915_vma *
4247 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4248                                      u32 alignment,
4249                                      const struct i915_ggtt_view *view,
4250                                      unsigned int flags)
4251 {
4252         struct i915_vma *vma;
4253         int ret;
4254
4255         lockdep_assert_held(&obj->base.dev->struct_mutex);
4256
4257         /* Mark the global pin early so that we account for the
4258          * display coherency whilst setting up the cache domains.
4259          */
4260         obj->pin_global++;
4261
4262         /* The display engine is not coherent with the LLC cache on gen6.  As
4263          * a result, we make sure that the pinning that is about to occur is
4264          * done with uncached PTEs. This is lowest common denominator for all
4265          * chipsets.
4266          *
4267          * However for gen6+, we could do better by using the GFDT bit instead
4268          * of uncaching, which would allow us to flush all the LLC-cached data
4269          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4270          */
4271         ret = i915_gem_object_set_cache_level(obj,
4272                                               HAS_WT(to_i915(obj->base.dev)) ?
4273                                               I915_CACHE_WT : I915_CACHE_NONE);
4274         if (ret) {
4275                 vma = ERR_PTR(ret);
4276                 goto err_unpin_global;
4277         }
4278
4279         /* As the user may map the buffer once pinned in the display plane
4280          * (e.g. libkms for the bootup splash), we have to ensure that we
4281          * always use map_and_fenceable for all scanout buffers. However,
4282          * it may simply be too big to fit into mappable, in which case
4283          * put it anyway and hope that userspace can cope (but always first
4284          * try to preserve the existing ABI).
4285          */
4286         vma = ERR_PTR(-ENOSPC);
4287         if ((flags & PIN_MAPPABLE) == 0 &&
4288             (!view || view->type == I915_GGTT_VIEW_NORMAL))
4289                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4290                                                flags |
4291                                                PIN_MAPPABLE |
4292                                                PIN_NONBLOCK);
4293         if (IS_ERR(vma))
4294                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4295         if (IS_ERR(vma))
4296                 goto err_unpin_global;
4297
4298         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4299
4300         __i915_gem_object_flush_for_display(obj);
4301
4302         /* It should now be out of any other write domains, and we can update
4303          * the domain values for our changes.
4304          */
4305         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4306
4307         return vma;
4308
4309 err_unpin_global:
4310         obj->pin_global--;
4311         return vma;
4312 }
4313
4314 void
4315 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4316 {
4317         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4318
4319         if (WARN_ON(vma->obj->pin_global == 0))
4320                 return;
4321
4322         if (--vma->obj->pin_global == 0)
4323                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4324
4325         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
4326         i915_gem_object_bump_inactive_ggtt(vma->obj);
4327
4328         i915_vma_unpin(vma);
4329 }
4330
4331 /**
4332  * Moves a single object to the CPU read, and possibly write domain.
4333  * @obj: object to act on
4334  * @write: requesting write or read-only access
4335  *
4336  * This function returns when the move is complete, including waiting on
4337  * flushes to occur.
4338  */
4339 int
4340 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4341 {
4342         int ret;
4343
4344         lockdep_assert_held(&obj->base.dev->struct_mutex);
4345
4346         ret = i915_gem_object_wait(obj,
4347                                    I915_WAIT_INTERRUPTIBLE |
4348                                    I915_WAIT_LOCKED |
4349                                    (write ? I915_WAIT_ALL : 0),
4350                                    MAX_SCHEDULE_TIMEOUT,
4351                                    NULL);
4352         if (ret)
4353                 return ret;
4354
4355         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4356
4357         /* Flush the CPU cache if it's still invalid. */
4358         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4359                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4360                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
4361         }
4362
4363         /* It should now be out of any other write domains, and we can update
4364          * the domain values for our changes.
4365          */
4366         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4367
4368         /* If we're writing through the CPU, then the GPU read domains will
4369          * need to be invalidated at next use.
4370          */
4371         if (write)
4372                 __start_cpu_write(obj);
4373
4374         return 0;
4375 }
4376
4377 /* Throttle our rendering by waiting until the ring has completed our requests
4378  * emitted over 20 msec ago.
4379  *
4380  * Note that if we were to use the current jiffies each time around the loop,
4381  * we wouldn't escape the function with any frames outstanding if the time to
4382  * render a frame was over 20ms.
4383  *
4384  * This should get us reasonable parallelism between CPU and GPU but also
4385  * relatively low latency when blocking on a particular request to finish.
4386  */
4387 static int
4388 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4389 {
4390         struct drm_i915_private *dev_priv = to_i915(dev);
4391         struct drm_i915_file_private *file_priv = file->driver_priv;
4392         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4393         struct i915_request *request, *target = NULL;
4394         long ret;
4395
4396         /* ABI: return -EIO if already wedged */
4397         if (i915_terminally_wedged(&dev_priv->gpu_error))
4398                 return -EIO;
4399
4400         spin_lock(&file_priv->mm.lock);
4401         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4402                 if (time_after_eq(request->emitted_jiffies, recent_enough))
4403                         break;
4404
4405                 if (target) {
4406                         list_del(&target->client_link);
4407                         target->file_priv = NULL;
4408                 }
4409
4410                 target = request;
4411         }
4412         if (target)
4413                 i915_request_get(target);
4414         spin_unlock(&file_priv->mm.lock);
4415
4416         if (target == NULL)
4417                 return 0;
4418
4419         ret = i915_request_wait(target,
4420                                 I915_WAIT_INTERRUPTIBLE,
4421                                 MAX_SCHEDULE_TIMEOUT);
4422         i915_request_put(target);
4423
4424         return ret < 0 ? ret : 0;
4425 }
4426
4427 struct i915_vma *
4428 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4429                          const struct i915_ggtt_view *view,
4430                          u64 size,
4431                          u64 alignment,
4432                          u64 flags)
4433 {
4434         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4435         struct i915_address_space *vm = &dev_priv->ggtt.vm;
4436
4437         return i915_gem_object_pin(obj, vm, view, size, alignment,
4438                                    flags | PIN_GLOBAL);
4439 }
4440
4441 struct i915_vma *
4442 i915_gem_object_pin(struct drm_i915_gem_object *obj,
4443                     struct i915_address_space *vm,
4444                     const struct i915_ggtt_view *view,
4445                     u64 size,
4446                     u64 alignment,
4447                     u64 flags)
4448 {
4449         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4450         struct i915_vma *vma;
4451         int ret;
4452
4453         lockdep_assert_held(&obj->base.dev->struct_mutex);
4454
4455         if (flags & PIN_MAPPABLE &&
4456             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4457                 /* If the required space is larger than the available
4458                  * aperture, we will not able to find a slot for the
4459                  * object and unbinding the object now will be in
4460                  * vain. Worse, doing so may cause us to ping-pong
4461                  * the object in and out of the Global GTT and
4462                  * waste a lot of cycles under the mutex.
4463                  */
4464                 if (obj->base.size > dev_priv->ggtt.mappable_end)
4465                         return ERR_PTR(-E2BIG);
4466
4467                 /* If NONBLOCK is set the caller is optimistically
4468                  * trying to cache the full object within the mappable
4469                  * aperture, and *must* have a fallback in place for
4470                  * situations where we cannot bind the object. We
4471                  * can be a little more lax here and use the fallback
4472                  * more often to avoid costly migrations of ourselves
4473                  * and other objects within the aperture.
4474                  *
4475                  * Half-the-aperture is used as a simple heuristic.
4476                  * More interesting would to do search for a free
4477                  * block prior to making the commitment to unbind.
4478                  * That caters for the self-harm case, and with a
4479                  * little more heuristics (e.g. NOFAULT, NOEVICT)
4480                  * we could try to minimise harm to others.
4481                  */
4482                 if (flags & PIN_NONBLOCK &&
4483                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
4484                         return ERR_PTR(-ENOSPC);
4485         }
4486
4487         vma = i915_vma_instance(obj, vm, view);
4488         if (unlikely(IS_ERR(vma)))
4489                 return vma;
4490
4491         if (i915_vma_misplaced(vma, size, alignment, flags)) {
4492                 if (flags & PIN_NONBLOCK) {
4493                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4494                                 return ERR_PTR(-ENOSPC);
4495
4496                         if (flags & PIN_MAPPABLE &&
4497                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4498                                 return ERR_PTR(-ENOSPC);
4499                 }
4500
4501                 WARN(i915_vma_is_pinned(vma),
4502                      "bo is already pinned in ggtt with incorrect alignment:"
4503                      " offset=%08x, req.alignment=%llx,"
4504                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4505                      i915_ggtt_offset(vma), alignment,
4506                      !!(flags & PIN_MAPPABLE),
4507                      i915_vma_is_map_and_fenceable(vma));
4508                 ret = i915_vma_unbind(vma);
4509                 if (ret)
4510                         return ERR_PTR(ret);
4511         }
4512
4513         ret = i915_vma_pin(vma, size, alignment, flags);
4514         if (ret)
4515                 return ERR_PTR(ret);
4516
4517         return vma;
4518 }
4519
4520 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4521 {
4522         /* Note that we could alias engines in the execbuf API, but
4523          * that would be very unwise as it prevents userspace from
4524          * fine control over engine selection. Ahem.
4525          *
4526          * This should be something like EXEC_MAX_ENGINE instead of
4527          * I915_NUM_ENGINES.
4528          */
4529         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4530         return 0x10000 << id;
4531 }
4532
4533 static __always_inline unsigned int __busy_write_id(unsigned int id)
4534 {
4535         /* The uABI guarantees an active writer is also amongst the read
4536          * engines. This would be true if we accessed the activity tracking
4537          * under the lock, but as we perform the lookup of the object and
4538          * its activity locklessly we can not guarantee that the last_write
4539          * being active implies that we have set the same engine flag from
4540          * last_read - hence we always set both read and write busy for
4541          * last_write.
4542          */
4543         return id | __busy_read_flag(id);
4544 }
4545
4546 static __always_inline unsigned int
4547 __busy_set_if_active(const struct dma_fence *fence,
4548                      unsigned int (*flag)(unsigned int id))
4549 {
4550         struct i915_request *rq;
4551
4552         /* We have to check the current hw status of the fence as the uABI
4553          * guarantees forward progress. We could rely on the idle worker
4554          * to eventually flush us, but to minimise latency just ask the
4555          * hardware.
4556          *
4557          * Note we only report on the status of native fences.
4558          */
4559         if (!dma_fence_is_i915(fence))
4560                 return 0;
4561
4562         /* opencode to_request() in order to avoid const warnings */
4563         rq = container_of(fence, struct i915_request, fence);
4564         if (i915_request_completed(rq))
4565                 return 0;
4566
4567         return flag(rq->engine->uabi_id);
4568 }
4569
4570 static __always_inline unsigned int
4571 busy_check_reader(const struct dma_fence *fence)
4572 {
4573         return __busy_set_if_active(fence, __busy_read_flag);
4574 }
4575
4576 static __always_inline unsigned int
4577 busy_check_writer(const struct dma_fence *fence)
4578 {
4579         if (!fence)
4580                 return 0;
4581
4582         return __busy_set_if_active(fence, __busy_write_id);
4583 }
4584
4585 int
4586 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4587                     struct drm_file *file)
4588 {
4589         struct drm_i915_gem_busy *args = data;
4590         struct drm_i915_gem_object *obj;
4591         struct reservation_object_list *list;
4592         unsigned int seq;
4593         int err;
4594
4595         err = -ENOENT;
4596         rcu_read_lock();
4597         obj = i915_gem_object_lookup_rcu(file, args->handle);
4598         if (!obj)
4599                 goto out;
4600
4601         /* A discrepancy here is that we do not report the status of
4602          * non-i915 fences, i.e. even though we may report the object as idle,
4603          * a call to set-domain may still stall waiting for foreign rendering.
4604          * This also means that wait-ioctl may report an object as busy,
4605          * where busy-ioctl considers it idle.
4606          *
4607          * We trade the ability to warn of foreign fences to report on which
4608          * i915 engines are active for the object.
4609          *
4610          * Alternatively, we can trade that extra information on read/write
4611          * activity with
4612          *      args->busy =
4613          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4614          * to report the overall busyness. This is what the wait-ioctl does.
4615          *
4616          */
4617 retry:
4618         seq = raw_read_seqcount(&obj->resv->seq);
4619
4620         /* Translate the exclusive fence to the READ *and* WRITE engine */
4621         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4622
4623         /* Translate shared fences to READ set of engines */
4624         list = rcu_dereference(obj->resv->fence);
4625         if (list) {
4626                 unsigned int shared_count = list->shared_count, i;
4627
4628                 for (i = 0; i < shared_count; ++i) {
4629                         struct dma_fence *fence =
4630                                 rcu_dereference(list->shared[i]);
4631
4632                         args->busy |= busy_check_reader(fence);
4633                 }
4634         }
4635
4636         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4637                 goto retry;
4638
4639         err = 0;
4640 out:
4641         rcu_read_unlock();
4642         return err;
4643 }
4644
4645 int
4646 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4647                         struct drm_file *file_priv)
4648 {
4649         return i915_gem_ring_throttle(dev, file_priv);
4650 }
4651
4652 int
4653 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4654                        struct drm_file *file_priv)
4655 {
4656         struct drm_i915_private *dev_priv = to_i915(dev);
4657         struct drm_i915_gem_madvise *args = data;
4658         struct drm_i915_gem_object *obj;
4659         int err;
4660
4661         switch (args->madv) {
4662         case I915_MADV_DONTNEED:
4663         case I915_MADV_WILLNEED:
4664             break;
4665         default:
4666             return -EINVAL;
4667         }
4668
4669         obj = i915_gem_object_lookup(file_priv, args->handle);
4670         if (!obj)
4671                 return -ENOENT;
4672
4673         err = mutex_lock_interruptible(&obj->mm.lock);
4674         if (err)
4675                 goto out;
4676
4677         if (i915_gem_object_has_pages(obj) &&
4678             i915_gem_object_is_tiled(obj) &&
4679             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4680                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4681                         GEM_BUG_ON(!obj->mm.quirked);
4682                         __i915_gem_object_unpin_pages(obj);
4683                         obj->mm.quirked = false;
4684                 }
4685                 if (args->madv == I915_MADV_WILLNEED) {
4686                         GEM_BUG_ON(obj->mm.quirked);
4687                         __i915_gem_object_pin_pages(obj);
4688                         obj->mm.quirked = true;
4689                 }
4690         }
4691
4692         if (obj->mm.madv != __I915_MADV_PURGED)
4693                 obj->mm.madv = args->madv;
4694
4695         /* if the object is no longer attached, discard its backing storage */
4696         if (obj->mm.madv == I915_MADV_DONTNEED &&
4697             !i915_gem_object_has_pages(obj))
4698                 i915_gem_object_truncate(obj);
4699
4700         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4701         mutex_unlock(&obj->mm.lock);
4702
4703 out:
4704         i915_gem_object_put(obj);
4705         return err;
4706 }
4707
4708 static void
4709 frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4710 {
4711         struct drm_i915_gem_object *obj =
4712                 container_of(active, typeof(*obj), frontbuffer_write);
4713
4714         intel_fb_obj_flush(obj, ORIGIN_CS);
4715 }
4716
4717 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4718                           const struct drm_i915_gem_object_ops *ops)
4719 {
4720         mutex_init(&obj->mm.lock);
4721
4722         INIT_LIST_HEAD(&obj->vma_list);
4723         INIT_LIST_HEAD(&obj->lut_list);
4724         INIT_LIST_HEAD(&obj->batch_pool_link);
4725
4726         obj->ops = ops;
4727
4728         reservation_object_init(&obj->__builtin_resv);
4729         obj->resv = &obj->__builtin_resv;
4730
4731         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4732         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4733
4734         obj->mm.madv = I915_MADV_WILLNEED;
4735         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4736         mutex_init(&obj->mm.get_page.lock);
4737
4738         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4739 }
4740
4741 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4742         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4743                  I915_GEM_OBJECT_IS_SHRINKABLE,
4744
4745         .get_pages = i915_gem_object_get_pages_gtt,
4746         .put_pages = i915_gem_object_put_pages_gtt,
4747
4748         .pwrite = i915_gem_object_pwrite_gtt,
4749 };
4750
4751 static int i915_gem_object_create_shmem(struct drm_device *dev,
4752                                         struct drm_gem_object *obj,
4753                                         size_t size)
4754 {
4755         struct drm_i915_private *i915 = to_i915(dev);
4756         unsigned long flags = VM_NORESERVE;
4757         struct file *filp;
4758
4759         drm_gem_private_object_init(dev, obj, size);
4760
4761         if (i915->mm.gemfs)
4762                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4763                                                  flags);
4764         else
4765                 filp = shmem_file_setup("i915", size, flags);
4766
4767         if (IS_ERR(filp))
4768                 return PTR_ERR(filp);
4769
4770         obj->filp = filp;
4771
4772         return 0;
4773 }
4774
4775 struct drm_i915_gem_object *
4776 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4777 {
4778         struct drm_i915_gem_object *obj;
4779         struct address_space *mapping;
4780         unsigned int cache_level;
4781         gfp_t mask;
4782         int ret;
4783
4784         /* There is a prevalence of the assumption that we fit the object's
4785          * page count inside a 32bit _signed_ variable. Let's document this and
4786          * catch if we ever need to fix it. In the meantime, if you do spot
4787          * such a local variable, please consider fixing!
4788          */
4789         if (size >> PAGE_SHIFT > INT_MAX)
4790                 return ERR_PTR(-E2BIG);
4791
4792         if (overflows_type(size, obj->base.size))
4793                 return ERR_PTR(-E2BIG);
4794
4795         obj = i915_gem_object_alloc(dev_priv);
4796         if (obj == NULL)
4797                 return ERR_PTR(-ENOMEM);
4798
4799         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4800         if (ret)
4801                 goto fail;
4802
4803         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4804         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4805                 /* 965gm cannot relocate objects above 4GiB. */
4806                 mask &= ~__GFP_HIGHMEM;
4807                 mask |= __GFP_DMA32;
4808         }
4809
4810         mapping = obj->base.filp->f_mapping;
4811         mapping_set_gfp_mask(mapping, mask);
4812         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4813
4814         i915_gem_object_init(obj, &i915_gem_object_ops);
4815
4816         obj->write_domain = I915_GEM_DOMAIN_CPU;
4817         obj->read_domains = I915_GEM_DOMAIN_CPU;
4818
4819         if (HAS_LLC(dev_priv))
4820                 /* On some devices, we can have the GPU use the LLC (the CPU
4821                  * cache) for about a 10% performance improvement
4822                  * compared to uncached.  Graphics requests other than
4823                  * display scanout are coherent with the CPU in
4824                  * accessing this cache.  This means in this mode we
4825                  * don't need to clflush on the CPU side, and on the
4826                  * GPU side we only need to flush internal caches to
4827                  * get data visible to the CPU.
4828                  *
4829                  * However, we maintain the display planes as UC, and so
4830                  * need to rebind when first used as such.
4831                  */
4832                 cache_level = I915_CACHE_LLC;
4833         else
4834                 cache_level = I915_CACHE_NONE;
4835
4836         i915_gem_object_set_cache_coherency(obj, cache_level);
4837
4838         trace_i915_gem_object_create(obj);
4839
4840         return obj;
4841
4842 fail:
4843         i915_gem_object_free(obj);
4844         return ERR_PTR(ret);
4845 }
4846
4847 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4848 {
4849         /* If we are the last user of the backing storage (be it shmemfs
4850          * pages or stolen etc), we know that the pages are going to be
4851          * immediately released. In this case, we can then skip copying
4852          * back the contents from the GPU.
4853          */
4854
4855         if (obj->mm.madv != I915_MADV_WILLNEED)
4856                 return false;
4857
4858         if (obj->base.filp == NULL)
4859                 return true;
4860
4861         /* At first glance, this looks racy, but then again so would be
4862          * userspace racing mmap against close. However, the first external
4863          * reference to the filp can only be obtained through the
4864          * i915_gem_mmap_ioctl() which safeguards us against the user
4865          * acquiring such a reference whilst we are in the middle of
4866          * freeing the object.
4867          */
4868         return atomic_long_read(&obj->base.filp->f_count) == 1;
4869 }
4870
4871 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4872                                     struct llist_node *freed)
4873 {
4874         struct drm_i915_gem_object *obj, *on;
4875
4876         intel_runtime_pm_get(i915);
4877         llist_for_each_entry_safe(obj, on, freed, freed) {
4878                 struct i915_vma *vma, *vn;
4879
4880                 trace_i915_gem_object_destroy(obj);
4881
4882                 mutex_lock(&i915->drm.struct_mutex);
4883
4884                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4885                 list_for_each_entry_safe(vma, vn,
4886                                          &obj->vma_list, obj_link) {
4887                         GEM_BUG_ON(i915_vma_is_active(vma));
4888                         vma->flags &= ~I915_VMA_PIN_MASK;
4889                         i915_vma_destroy(vma);
4890                 }
4891                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4892                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4893
4894                 /* This serializes freeing with the shrinker. Since the free
4895                  * is delayed, first by RCU then by the workqueue, we want the
4896                  * shrinker to be able to free pages of unreferenced objects,
4897                  * or else we may oom whilst there are plenty of deferred
4898                  * freed objects.
4899                  */
4900                 if (i915_gem_object_has_pages(obj)) {
4901                         spin_lock(&i915->mm.obj_lock);
4902                         list_del_init(&obj->mm.link);
4903                         spin_unlock(&i915->mm.obj_lock);
4904                 }
4905
4906                 mutex_unlock(&i915->drm.struct_mutex);
4907
4908                 GEM_BUG_ON(obj->bind_count);
4909                 GEM_BUG_ON(obj->userfault_count);
4910                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4911                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4912
4913                 if (obj->ops->release)
4914                         obj->ops->release(obj);
4915
4916                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4917                         atomic_set(&obj->mm.pages_pin_count, 0);
4918                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4919                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4920
4921                 if (obj->base.import_attach)
4922                         drm_prime_gem_destroy(&obj->base, NULL);
4923
4924                 reservation_object_fini(&obj->__builtin_resv);
4925                 drm_gem_object_release(&obj->base);
4926                 i915_gem_info_remove_obj(i915, obj->base.size);
4927
4928                 kfree(obj->bit_17);
4929                 i915_gem_object_free(obj);
4930
4931                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4932                 atomic_dec(&i915->mm.free_count);
4933
4934                 if (on)
4935                         cond_resched();
4936         }
4937         intel_runtime_pm_put(i915);
4938 }
4939
4940 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4941 {
4942         struct llist_node *freed;
4943
4944         /* Free the oldest, most stale object to keep the free_list short */
4945         freed = NULL;
4946         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4947                 /* Only one consumer of llist_del_first() allowed */
4948                 spin_lock(&i915->mm.free_lock);
4949                 freed = llist_del_first(&i915->mm.free_list);
4950                 spin_unlock(&i915->mm.free_lock);
4951         }
4952         if (unlikely(freed)) {
4953                 freed->next = NULL;
4954                 __i915_gem_free_objects(i915, freed);
4955         }
4956 }
4957
4958 static void __i915_gem_free_work(struct work_struct *work)
4959 {
4960         struct drm_i915_private *i915 =
4961                 container_of(work, struct drm_i915_private, mm.free_work);
4962         struct llist_node *freed;
4963
4964         /*
4965          * All file-owned VMA should have been released by this point through
4966          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4967          * However, the object may also be bound into the global GTT (e.g.
4968          * older GPUs without per-process support, or for direct access through
4969          * the GTT either for the user or for scanout). Those VMA still need to
4970          * unbound now.
4971          */
4972
4973         spin_lock(&i915->mm.free_lock);
4974         while ((freed = llist_del_all(&i915->mm.free_list))) {
4975                 spin_unlock(&i915->mm.free_lock);
4976
4977                 __i915_gem_free_objects(i915, freed);
4978                 if (need_resched())
4979                         return;
4980
4981                 spin_lock(&i915->mm.free_lock);
4982         }
4983         spin_unlock(&i915->mm.free_lock);
4984 }
4985
4986 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4987 {
4988         struct drm_i915_gem_object *obj =
4989                 container_of(head, typeof(*obj), rcu);
4990         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4991
4992         /*
4993          * Since we require blocking on struct_mutex to unbind the freed
4994          * object from the GPU before releasing resources back to the
4995          * system, we can not do that directly from the RCU callback (which may
4996          * be a softirq context), but must instead then defer that work onto a
4997          * kthread. We use the RCU callback rather than move the freed object
4998          * directly onto the work queue so that we can mix between using the
4999          * worker and performing frees directly from subsequent allocations for
5000          * crude but effective memory throttling.
5001          */
5002         if (llist_add(&obj->freed, &i915->mm.free_list))
5003                 queue_work(i915->wq, &i915->mm.free_work);
5004 }
5005
5006 void i915_gem_free_object(struct drm_gem_object *gem_obj)
5007 {
5008         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
5009
5010         if (obj->mm.quirked)
5011                 __i915_gem_object_unpin_pages(obj);
5012
5013         if (discard_backing_storage(obj))
5014                 obj->mm.madv = I915_MADV_DONTNEED;
5015
5016         /*
5017          * Before we free the object, make sure any pure RCU-only
5018          * read-side critical sections are complete, e.g.
5019          * i915_gem_busy_ioctl(). For the corresponding synchronized
5020          * lookup see i915_gem_object_lookup_rcu().
5021          */
5022         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
5023         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
5024 }
5025
5026 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
5027 {
5028         lockdep_assert_held(&obj->base.dev->struct_mutex);
5029
5030         if (!i915_gem_object_has_active_reference(obj) &&
5031             i915_gem_object_is_active(obj))
5032                 i915_gem_object_set_active_reference(obj);
5033         else
5034                 i915_gem_object_put(obj);
5035 }
5036
5037 void i915_gem_sanitize(struct drm_i915_private *i915)
5038 {
5039         int err;
5040
5041         GEM_TRACE("\n");
5042
5043         mutex_lock(&i915->drm.struct_mutex);
5044
5045         intel_runtime_pm_get(i915);
5046         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5047
5048         /*
5049          * As we have just resumed the machine and woken the device up from
5050          * deep PCI sleep (presumably D3_cold), assume the HW has been reset
5051          * back to defaults, recovering from whatever wedged state we left it
5052          * in and so worth trying to use the device once more.
5053          */
5054         if (i915_terminally_wedged(&i915->gpu_error))
5055                 i915_gem_unset_wedged(i915);
5056
5057         /*
5058          * If we inherit context state from the BIOS or earlier occupants
5059          * of the GPU, the GPU may be in an inconsistent state when we
5060          * try to take over. The only way to remove the earlier state
5061          * is by resetting. However, resetting on earlier gen is tricky as
5062          * it may impact the display and we are uncertain about the stability
5063          * of the reset, so this could be applied to even earlier gen.
5064          */
5065         err = -ENODEV;
5066         if (INTEL_GEN(i915) >= 5 && intel_has_gpu_reset(i915))
5067                 err = WARN_ON(intel_gpu_reset(i915, ALL_ENGINES));
5068         if (!err)
5069                 intel_engines_sanitize(i915);
5070
5071         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5072         intel_runtime_pm_put(i915);
5073
5074         i915_gem_contexts_lost(i915);
5075         mutex_unlock(&i915->drm.struct_mutex);
5076 }
5077
5078 int i915_gem_suspend(struct drm_i915_private *i915)
5079 {
5080         int ret;
5081
5082         GEM_TRACE("\n");
5083
5084         intel_runtime_pm_get(i915);
5085         intel_suspend_gt_powersave(i915);
5086
5087         mutex_lock(&i915->drm.struct_mutex);
5088
5089         /*
5090          * We have to flush all the executing contexts to main memory so
5091          * that they can saved in the hibernation image. To ensure the last
5092          * context image is coherent, we have to switch away from it. That
5093          * leaves the i915->kernel_context still active when
5094          * we actually suspend, and its image in memory may not match the GPU
5095          * state. Fortunately, the kernel_context is disposable and we do
5096          * not rely on its state.
5097          */
5098         if (!i915_terminally_wedged(&i915->gpu_error)) {
5099                 ret = i915_gem_switch_to_kernel_context(i915);
5100                 if (ret)
5101                         goto err_unlock;
5102
5103                 ret = i915_gem_wait_for_idle(i915,
5104                                              I915_WAIT_INTERRUPTIBLE |
5105                                              I915_WAIT_LOCKED |
5106                                              I915_WAIT_FOR_IDLE_BOOST,
5107                                              MAX_SCHEDULE_TIMEOUT);
5108                 if (ret && ret != -EIO)
5109                         goto err_unlock;
5110
5111                 assert_kernel_context_is_current(i915);
5112         }
5113         i915_retire_requests(i915); /* ensure we flush after wedging */
5114
5115         mutex_unlock(&i915->drm.struct_mutex);
5116
5117         intel_uc_suspend(i915);
5118
5119         cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
5120         cancel_delayed_work_sync(&i915->gt.retire_work);
5121
5122         /*
5123          * As the idle_work is rearming if it detects a race, play safe and
5124          * repeat the flush until it is definitely idle.
5125          */
5126         drain_delayed_work(&i915->gt.idle_work);
5127
5128         /*
5129          * Assert that we successfully flushed all the work and
5130          * reset the GPU back to its idle, low power state.
5131          */
5132         WARN_ON(i915->gt.awake);
5133         if (WARN_ON(!intel_engines_are_idle(i915)))
5134                 i915_gem_set_wedged(i915); /* no hope, discard everything */
5135
5136         intel_runtime_pm_put(i915);
5137         return 0;
5138
5139 err_unlock:
5140         mutex_unlock(&i915->drm.struct_mutex);
5141         intel_runtime_pm_put(i915);
5142         return ret;
5143 }
5144
5145 void i915_gem_suspend_late(struct drm_i915_private *i915)
5146 {
5147         struct drm_i915_gem_object *obj;
5148         struct list_head *phases[] = {
5149                 &i915->mm.unbound_list,
5150                 &i915->mm.bound_list,
5151                 NULL
5152         }, **phase;
5153
5154         /*
5155          * Neither the BIOS, ourselves or any other kernel
5156          * expects the system to be in execlists mode on startup,
5157          * so we need to reset the GPU back to legacy mode. And the only
5158          * known way to disable logical contexts is through a GPU reset.
5159          *
5160          * So in order to leave the system in a known default configuration,
5161          * always reset the GPU upon unload and suspend. Afterwards we then
5162          * clean up the GEM state tracking, flushing off the requests and
5163          * leaving the system in a known idle state.
5164          *
5165          * Note that is of the upmost importance that the GPU is idle and
5166          * all stray writes are flushed *before* we dismantle the backing
5167          * storage for the pinned objects.
5168          *
5169          * However, since we are uncertain that resetting the GPU on older
5170          * machines is a good idea, we don't - just in case it leaves the
5171          * machine in an unusable condition.
5172          */
5173
5174         mutex_lock(&i915->drm.struct_mutex);
5175         for (phase = phases; *phase; phase++) {
5176                 list_for_each_entry(obj, *phase, mm.link)
5177                         WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
5178         }
5179         mutex_unlock(&i915->drm.struct_mutex);
5180
5181         intel_uc_sanitize(i915);
5182         i915_gem_sanitize(i915);
5183 }
5184
5185 void i915_gem_resume(struct drm_i915_private *i915)
5186 {
5187         GEM_TRACE("\n");
5188
5189         WARN_ON(i915->gt.awake);
5190
5191         mutex_lock(&i915->drm.struct_mutex);
5192         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5193
5194         i915_gem_restore_gtt_mappings(i915);
5195         i915_gem_restore_fences(i915);
5196
5197         /*
5198          * As we didn't flush the kernel context before suspend, we cannot
5199          * guarantee that the context image is complete. So let's just reset
5200          * it and start again.
5201          */
5202         i915->gt.resume(i915);
5203
5204         if (i915_gem_init_hw(i915))
5205                 goto err_wedged;
5206
5207         intel_uc_resume(i915);
5208
5209         /* Always reload a context for powersaving. */
5210         if (i915_gem_switch_to_kernel_context(i915))
5211                 goto err_wedged;
5212
5213 out_unlock:
5214         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5215         mutex_unlock(&i915->drm.struct_mutex);
5216         return;
5217
5218 err_wedged:
5219         if (!i915_terminally_wedged(&i915->gpu_error)) {
5220                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5221                 i915_gem_set_wedged(i915);
5222         }
5223         goto out_unlock;
5224 }
5225
5226 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5227 {
5228         if (INTEL_GEN(dev_priv) < 5 ||
5229             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5230                 return;
5231
5232         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5233                                  DISP_TILE_SURFACE_SWIZZLING);
5234
5235         if (IS_GEN5(dev_priv))
5236                 return;
5237
5238         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5239         if (IS_GEN6(dev_priv))
5240                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5241         else if (IS_GEN7(dev_priv))
5242                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5243         else if (IS_GEN8(dev_priv))
5244                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5245         else
5246                 BUG();
5247 }
5248
5249 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5250 {
5251         I915_WRITE(RING_CTL(base), 0);
5252         I915_WRITE(RING_HEAD(base), 0);
5253         I915_WRITE(RING_TAIL(base), 0);
5254         I915_WRITE(RING_START(base), 0);
5255 }
5256
5257 static void init_unused_rings(struct drm_i915_private *dev_priv)
5258 {
5259         if (IS_I830(dev_priv)) {
5260                 init_unused_ring(dev_priv, PRB1_BASE);
5261                 init_unused_ring(dev_priv, SRB0_BASE);
5262                 init_unused_ring(dev_priv, SRB1_BASE);
5263                 init_unused_ring(dev_priv, SRB2_BASE);
5264                 init_unused_ring(dev_priv, SRB3_BASE);
5265         } else if (IS_GEN2(dev_priv)) {
5266                 init_unused_ring(dev_priv, SRB0_BASE);
5267                 init_unused_ring(dev_priv, SRB1_BASE);
5268         } else if (IS_GEN3(dev_priv)) {
5269                 init_unused_ring(dev_priv, PRB1_BASE);
5270                 init_unused_ring(dev_priv, PRB2_BASE);
5271         }
5272 }
5273
5274 static int __i915_gem_restart_engines(void *data)
5275 {
5276         struct drm_i915_private *i915 = data;
5277         struct intel_engine_cs *engine;
5278         enum intel_engine_id id;
5279         int err;
5280
5281         for_each_engine(engine, i915, id) {
5282                 err = engine->init_hw(engine);
5283                 if (err) {
5284                         DRM_ERROR("Failed to restart %s (%d)\n",
5285                                   engine->name, err);
5286                         return err;
5287                 }
5288         }
5289
5290         return 0;
5291 }
5292
5293 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5294 {
5295         int ret;
5296
5297         dev_priv->gt.last_init_time = ktime_get();
5298
5299         /* Double layer security blanket, see i915_gem_init() */
5300         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5301
5302         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5303                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5304
5305         if (IS_HASWELL(dev_priv))
5306                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5307                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5308
5309         if (HAS_PCH_NOP(dev_priv)) {
5310                 if (IS_IVYBRIDGE(dev_priv)) {
5311                         u32 temp = I915_READ(GEN7_MSG_CTL);
5312                         temp &= ~(WAIT_FOR_PCH_FLR_ACK | WAIT_FOR_PCH_RESET_ACK);
5313                         I915_WRITE(GEN7_MSG_CTL, temp);
5314                 } else if (INTEL_GEN(dev_priv) >= 7) {
5315                         u32 temp = I915_READ(HSW_NDE_RSTWRN_OPT);
5316                         temp &= ~RESET_PCH_HANDSHAKE_ENABLE;
5317                         I915_WRITE(HSW_NDE_RSTWRN_OPT, temp);
5318                 }
5319         }
5320
5321         intel_gt_workarounds_apply(dev_priv);
5322
5323         i915_gem_init_swizzling(dev_priv);
5324
5325         /*
5326          * At least 830 can leave some of the unused rings
5327          * "active" (ie. head != tail) after resume which
5328          * will prevent c3 entry. Makes sure all unused rings
5329          * are totally idle.
5330          */
5331         init_unused_rings(dev_priv);
5332
5333         BUG_ON(!dev_priv->kernel_context);
5334         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5335                 ret = -EIO;
5336                 goto out;
5337         }
5338
5339         ret = i915_ppgtt_init_hw(dev_priv);
5340         if (ret) {
5341                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5342                 goto out;
5343         }
5344
5345         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
5346         if (ret) {
5347                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
5348                 goto out;
5349         }
5350
5351         /* We can't enable contexts until all firmware is loaded */
5352         ret = intel_uc_init_hw(dev_priv);
5353         if (ret) {
5354                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
5355                 goto out;
5356         }
5357
5358         intel_mocs_init_l3cc_table(dev_priv);
5359
5360         /* Only when the HW is re-initialised, can we replay the requests */
5361         ret = __i915_gem_restart_engines(dev_priv);
5362         if (ret)
5363                 goto cleanup_uc;
5364
5365         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5366
5367         return 0;
5368
5369 cleanup_uc:
5370         intel_uc_fini_hw(dev_priv);
5371 out:
5372         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5373
5374         return ret;
5375 }
5376
5377 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5378 {
5379         struct i915_gem_context *ctx;
5380         struct intel_engine_cs *engine;
5381         enum intel_engine_id id;
5382         int err;
5383
5384         /*
5385          * As we reset the gpu during very early sanitisation, the current
5386          * register state on the GPU should reflect its defaults values.
5387          * We load a context onto the hw (with restore-inhibit), then switch
5388          * over to a second context to save that default register state. We
5389          * can then prime every new context with that state so they all start
5390          * from the same default HW values.
5391          */
5392
5393         ctx = i915_gem_context_create_kernel(i915, 0);
5394         if (IS_ERR(ctx))
5395                 return PTR_ERR(ctx);
5396
5397         for_each_engine(engine, i915, id) {
5398                 struct i915_request *rq;
5399
5400                 rq = i915_request_alloc(engine, ctx);
5401                 if (IS_ERR(rq)) {
5402                         err = PTR_ERR(rq);
5403                         goto out_ctx;
5404                 }
5405
5406                 err = 0;
5407                 if (engine->init_context)
5408                         err = engine->init_context(rq);
5409
5410                 i915_request_add(rq);
5411                 if (err)
5412                         goto err_active;
5413         }
5414
5415         err = i915_gem_switch_to_kernel_context(i915);
5416         if (err)
5417                 goto err_active;
5418
5419         if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
5420                 i915_gem_set_wedged(i915);
5421                 err = -EIO; /* Caller will declare us wedged */
5422                 goto err_active;
5423         }
5424
5425         assert_kernel_context_is_current(i915);
5426
5427         for_each_engine(engine, i915, id) {
5428                 struct i915_vma *state;
5429
5430                 state = to_intel_context(ctx, engine)->state;
5431                 if (!state)
5432                         continue;
5433
5434                 /*
5435                  * As we will hold a reference to the logical state, it will
5436                  * not be torn down with the context, and importantly the
5437                  * object will hold onto its vma (making it possible for a
5438                  * stray GTT write to corrupt our defaults). Unmap the vma
5439                  * from the GTT to prevent such accidents and reclaim the
5440                  * space.
5441                  */
5442                 err = i915_vma_unbind(state);
5443                 if (err)
5444                         goto err_active;
5445
5446                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5447                 if (err)
5448                         goto err_active;
5449
5450                 engine->default_state = i915_gem_object_get(state->obj);
5451         }
5452
5453         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5454                 unsigned int found = intel_engines_has_context_isolation(i915);
5455
5456                 /*
5457                  * Make sure that classes with multiple engine instances all
5458                  * share the same basic configuration.
5459                  */
5460                 for_each_engine(engine, i915, id) {
5461                         unsigned int bit = BIT(engine->uabi_class);
5462                         unsigned int expected = engine->default_state ? bit : 0;
5463
5464                         if ((found & bit) != expected) {
5465                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5466                                           engine->uabi_class, engine->name);
5467                         }
5468                 }
5469         }
5470
5471 out_ctx:
5472         i915_gem_context_set_closed(ctx);
5473         i915_gem_context_put(ctx);
5474         return err;
5475
5476 err_active:
5477         /*
5478          * If we have to abandon now, we expect the engines to be idle
5479          * and ready to be torn-down. First try to flush any remaining
5480          * request, ensure we are pointing at the kernel context and
5481          * then remove it.
5482          */
5483         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5484                 goto out_ctx;
5485
5486         if (WARN_ON(i915_gem_wait_for_idle(i915,
5487                                            I915_WAIT_LOCKED,
5488                                            MAX_SCHEDULE_TIMEOUT)))
5489                 goto out_ctx;
5490
5491         i915_gem_contexts_lost(i915);
5492         goto out_ctx;
5493 }
5494
5495 int i915_gem_init(struct drm_i915_private *dev_priv)
5496 {
5497         int ret;
5498
5499         /* We need to fallback to 4K pages if host doesn't support huge gtt. */
5500         if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
5501                 mkwrite_device_info(dev_priv)->page_sizes =
5502                         I915_GTT_PAGE_SIZE_4K;
5503
5504         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5505
5506         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5507                 dev_priv->gt.resume = intel_lr_context_resume;
5508                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5509         } else {
5510                 dev_priv->gt.resume = intel_legacy_submission_resume;
5511                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5512         }
5513
5514         ret = i915_gem_init_userptr(dev_priv);
5515         if (ret)
5516                 return ret;
5517
5518         ret = intel_uc_init_misc(dev_priv);
5519         if (ret)
5520                 return ret;
5521
5522         ret = intel_wopcm_init(&dev_priv->wopcm);
5523         if (ret)
5524                 goto err_uc_misc;
5525
5526         /* This is just a security blanket to placate dragons.
5527          * On some systems, we very sporadically observe that the first TLBs
5528          * used by the CS may be stale, despite us poking the TLB reset. If
5529          * we hold the forcewake during initialisation these problems
5530          * just magically go away.
5531          */
5532         mutex_lock(&dev_priv->drm.struct_mutex);
5533         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5534
5535         ret = i915_gem_init_ggtt(dev_priv);
5536         if (ret) {
5537                 GEM_BUG_ON(ret == -EIO);
5538                 goto err_unlock;
5539         }
5540
5541         ret = i915_gem_contexts_init(dev_priv);
5542         if (ret) {
5543                 GEM_BUG_ON(ret == -EIO);
5544                 goto err_ggtt;
5545         }
5546
5547         ret = intel_engines_init(dev_priv);
5548         if (ret) {
5549                 GEM_BUG_ON(ret == -EIO);
5550                 goto err_context;
5551         }
5552
5553         intel_init_gt_powersave(dev_priv);
5554
5555         ret = intel_uc_init(dev_priv);
5556         if (ret)
5557                 goto err_pm;
5558
5559         ret = i915_gem_init_hw(dev_priv);
5560         if (ret)
5561                 goto err_uc_init;
5562
5563         /*
5564          * Despite its name intel_init_clock_gating applies both display
5565          * clock gating workarounds; GT mmio workarounds and the occasional
5566          * GT power context workaround. Worse, sometimes it includes a context
5567          * register workaround which we need to apply before we record the
5568          * default HW state for all contexts.
5569          *
5570          * FIXME: break up the workarounds and apply them at the right time!
5571          */
5572         intel_init_clock_gating(dev_priv);
5573
5574         ret = __intel_engines_record_defaults(dev_priv);
5575         if (ret)
5576                 goto err_init_hw;
5577
5578         if (i915_inject_load_failure()) {
5579                 ret = -ENODEV;
5580                 goto err_init_hw;
5581         }
5582
5583         if (i915_inject_load_failure()) {
5584                 ret = -EIO;
5585                 goto err_init_hw;
5586         }
5587
5588         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5589         mutex_unlock(&dev_priv->drm.struct_mutex);
5590
5591         return 0;
5592
5593         /*
5594          * Unwinding is complicated by that we want to handle -EIO to mean
5595          * disable GPU submission but keep KMS alive. We want to mark the
5596          * HW as irrevisibly wedged, but keep enough state around that the
5597          * driver doesn't explode during runtime.
5598          */
5599 err_init_hw:
5600         mutex_unlock(&dev_priv->drm.struct_mutex);
5601
5602         WARN_ON(i915_gem_suspend(dev_priv));
5603         i915_gem_suspend_late(dev_priv);
5604
5605         i915_gem_drain_workqueue(dev_priv);
5606
5607         mutex_lock(&dev_priv->drm.struct_mutex);
5608         intel_uc_fini_hw(dev_priv);
5609 err_uc_init:
5610         intel_uc_fini(dev_priv);
5611 err_pm:
5612         if (ret != -EIO) {
5613                 intel_cleanup_gt_powersave(dev_priv);
5614                 i915_gem_cleanup_engines(dev_priv);
5615         }
5616 err_context:
5617         if (ret != -EIO)
5618                 i915_gem_contexts_fini(dev_priv);
5619 err_ggtt:
5620 err_unlock:
5621         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5622         mutex_unlock(&dev_priv->drm.struct_mutex);
5623
5624 err_uc_misc:
5625         intel_uc_fini_misc(dev_priv);
5626
5627         if (ret != -EIO)
5628                 i915_gem_cleanup_userptr(dev_priv);
5629
5630         if (ret == -EIO) {
5631                 mutex_lock(&dev_priv->drm.struct_mutex);
5632
5633                 /*
5634                  * Allow engine initialisation to fail by marking the GPU as
5635                  * wedged. But we only want to do this where the GPU is angry,
5636                  * for all other failure, such as an allocation failure, bail.
5637                  */
5638                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5639                         i915_load_error(dev_priv,
5640                                         "Failed to initialize GPU, declaring it wedged!\n");
5641                         i915_gem_set_wedged(dev_priv);
5642                 }
5643
5644                 /* Minimal basic recovery for KMS */
5645                 ret = i915_ggtt_enable_hw(dev_priv);
5646                 i915_gem_restore_gtt_mappings(dev_priv);
5647                 i915_gem_restore_fences(dev_priv);
5648                 intel_init_clock_gating(dev_priv);
5649
5650                 mutex_unlock(&dev_priv->drm.struct_mutex);
5651         }
5652
5653         i915_gem_drain_freed_objects(dev_priv);
5654         return ret;
5655 }
5656
5657 void i915_gem_fini(struct drm_i915_private *dev_priv)
5658 {
5659         i915_gem_suspend_late(dev_priv);
5660         intel_disable_gt_powersave(dev_priv);
5661
5662         /* Flush any outstanding unpin_work. */
5663         i915_gem_drain_workqueue(dev_priv);
5664
5665         mutex_lock(&dev_priv->drm.struct_mutex);
5666         intel_uc_fini_hw(dev_priv);
5667         intel_uc_fini(dev_priv);
5668         i915_gem_cleanup_engines(dev_priv);
5669         i915_gem_contexts_fini(dev_priv);
5670         mutex_unlock(&dev_priv->drm.struct_mutex);
5671
5672         intel_cleanup_gt_powersave(dev_priv);
5673
5674         intel_uc_fini_misc(dev_priv);
5675         i915_gem_cleanup_userptr(dev_priv);
5676
5677         i915_gem_drain_freed_objects(dev_priv);
5678
5679         WARN_ON(!list_empty(&dev_priv->contexts.list));
5680 }
5681
5682 void i915_gem_init_mmio(struct drm_i915_private *i915)
5683 {
5684         i915_gem_sanitize(i915);
5685 }
5686
5687 void
5688 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5689 {
5690         struct intel_engine_cs *engine;
5691         enum intel_engine_id id;
5692
5693         for_each_engine(engine, dev_priv, id)
5694                 dev_priv->gt.cleanup_engine(engine);
5695 }
5696
5697 void
5698 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5699 {
5700         int i;
5701
5702         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5703             !IS_CHERRYVIEW(dev_priv))
5704                 dev_priv->num_fence_regs = 32;
5705         else if (INTEL_GEN(dev_priv) >= 4 ||
5706                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5707                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5708                 dev_priv->num_fence_regs = 16;
5709         else
5710                 dev_priv->num_fence_regs = 8;
5711
5712         if (intel_vgpu_active(dev_priv))
5713                 dev_priv->num_fence_regs =
5714                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5715
5716         /* Initialize fence registers to zero */
5717         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5718                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5719
5720                 fence->i915 = dev_priv;
5721                 fence->id = i;
5722                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5723         }
5724         i915_gem_restore_fences(dev_priv);
5725
5726         i915_gem_detect_bit_6_swizzle(dev_priv);
5727 }
5728
5729 static void i915_gem_init__mm(struct drm_i915_private *i915)
5730 {
5731         spin_lock_init(&i915->mm.object_stat_lock);
5732         spin_lock_init(&i915->mm.obj_lock);
5733         spin_lock_init(&i915->mm.free_lock);
5734
5735         init_llist_head(&i915->mm.free_list);
5736
5737         INIT_LIST_HEAD(&i915->mm.unbound_list);
5738         INIT_LIST_HEAD(&i915->mm.bound_list);
5739         INIT_LIST_HEAD(&i915->mm.fence_list);
5740         INIT_LIST_HEAD(&i915->mm.userfault_list);
5741
5742         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5743 }
5744
5745 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5746 {
5747         int err = -ENOMEM;
5748
5749         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5750         if (!dev_priv->objects)
5751                 goto err_out;
5752
5753         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5754         if (!dev_priv->vmas)
5755                 goto err_objects;
5756
5757         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5758         if (!dev_priv->luts)
5759                 goto err_vmas;
5760
5761         dev_priv->requests = KMEM_CACHE(i915_request,
5762                                         SLAB_HWCACHE_ALIGN |
5763                                         SLAB_RECLAIM_ACCOUNT |
5764                                         SLAB_TYPESAFE_BY_RCU);
5765         if (!dev_priv->requests)
5766                 goto err_luts;
5767
5768         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5769                                             SLAB_HWCACHE_ALIGN |
5770                                             SLAB_RECLAIM_ACCOUNT);
5771         if (!dev_priv->dependencies)
5772                 goto err_requests;
5773
5774         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5775         if (!dev_priv->priorities)
5776                 goto err_dependencies;
5777
5778         INIT_LIST_HEAD(&dev_priv->gt.timelines);
5779         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5780         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5781
5782         i915_gem_init__mm(dev_priv);
5783
5784         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5785                           i915_gem_retire_work_handler);
5786         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5787                           i915_gem_idle_work_handler);
5788         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5789         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5790
5791         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5792
5793         spin_lock_init(&dev_priv->fb_tracking.lock);
5794
5795         err = i915_gemfs_init(dev_priv);
5796         if (err)
5797                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5798
5799         return 0;
5800
5801 err_dependencies:
5802         kmem_cache_destroy(dev_priv->dependencies);
5803 err_requests:
5804         kmem_cache_destroy(dev_priv->requests);
5805 err_luts:
5806         kmem_cache_destroy(dev_priv->luts);
5807 err_vmas:
5808         kmem_cache_destroy(dev_priv->vmas);
5809 err_objects:
5810         kmem_cache_destroy(dev_priv->objects);
5811 err_out:
5812         return err;
5813 }
5814
5815 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5816 {
5817         i915_gem_drain_freed_objects(dev_priv);
5818         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5819         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5820         WARN_ON(dev_priv->mm.object_count);
5821         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5822
5823         kmem_cache_destroy(dev_priv->priorities);
5824         kmem_cache_destroy(dev_priv->dependencies);
5825         kmem_cache_destroy(dev_priv->requests);
5826         kmem_cache_destroy(dev_priv->luts);
5827         kmem_cache_destroy(dev_priv->vmas);
5828         kmem_cache_destroy(dev_priv->objects);
5829
5830         /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
5831         rcu_barrier();
5832
5833         i915_gemfs_fini(dev_priv);
5834 }
5835
5836 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5837 {
5838         /* Discard all purgeable objects, let userspace recover those as
5839          * required after resuming.
5840          */
5841         i915_gem_shrink_all(dev_priv);
5842
5843         return 0;
5844 }
5845
5846 int i915_gem_freeze_late(struct drm_i915_private *i915)
5847 {
5848         struct drm_i915_gem_object *obj;
5849         struct list_head *phases[] = {
5850                 &i915->mm.unbound_list,
5851                 &i915->mm.bound_list,
5852                 NULL
5853         }, **phase;
5854
5855         /*
5856          * Called just before we write the hibernation image.
5857          *
5858          * We need to update the domain tracking to reflect that the CPU
5859          * will be accessing all the pages to create and restore from the
5860          * hibernation, and so upon restoration those pages will be in the
5861          * CPU domain.
5862          *
5863          * To make sure the hibernation image contains the latest state,
5864          * we update that state just before writing out the image.
5865          *
5866          * To try and reduce the hibernation image, we manually shrink
5867          * the objects as well, see i915_gem_freeze()
5868          */
5869
5870         i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5871         i915_gem_drain_freed_objects(i915);
5872
5873         mutex_lock(&i915->drm.struct_mutex);
5874         for (phase = phases; *phase; phase++) {
5875                 list_for_each_entry(obj, *phase, mm.link)
5876                         WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5877         }
5878         mutex_unlock(&i915->drm.struct_mutex);
5879
5880         return 0;
5881 }
5882
5883 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5884 {
5885         struct drm_i915_file_private *file_priv = file->driver_priv;
5886         struct i915_request *request;
5887
5888         /* Clean up our request list when the client is going away, so that
5889          * later retire_requests won't dereference our soon-to-be-gone
5890          * file_priv.
5891          */
5892         spin_lock(&file_priv->mm.lock);
5893         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5894                 request->file_priv = NULL;
5895         spin_unlock(&file_priv->mm.lock);
5896 }
5897
5898 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5899 {
5900         struct drm_i915_file_private *file_priv;
5901         int ret;
5902
5903         DRM_DEBUG("\n");
5904
5905         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5906         if (!file_priv)
5907                 return -ENOMEM;
5908
5909         file->driver_priv = file_priv;
5910         file_priv->dev_priv = i915;
5911         file_priv->file = file;
5912
5913         spin_lock_init(&file_priv->mm.lock);
5914         INIT_LIST_HEAD(&file_priv->mm.request_list);
5915
5916         file_priv->bsd_engine = -1;
5917         file_priv->hang_timestamp = jiffies;
5918
5919         ret = i915_gem_context_open(i915, file);
5920         if (ret)
5921                 kfree(file_priv);
5922
5923         return ret;
5924 }
5925
5926 /**
5927  * i915_gem_track_fb - update frontbuffer tracking
5928  * @old: current GEM buffer for the frontbuffer slots
5929  * @new: new GEM buffer for the frontbuffer slots
5930  * @frontbuffer_bits: bitmask of frontbuffer slots
5931  *
5932  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5933  * from @old and setting them in @new. Both @old and @new can be NULL.
5934  */
5935 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5936                        struct drm_i915_gem_object *new,
5937                        unsigned frontbuffer_bits)
5938 {
5939         /* Control of individual bits within the mask are guarded by
5940          * the owning plane->mutex, i.e. we can never see concurrent
5941          * manipulation of individual bits. But since the bitfield as a whole
5942          * is updated using RMW, we need to use atomics in order to update
5943          * the bits.
5944          */
5945         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5946                      sizeof(atomic_t) * BITS_PER_BYTE);
5947
5948         if (old) {
5949                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5950                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5951         }
5952
5953         if (new) {
5954                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5955                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5956         }
5957 }
5958
5959 /* Allocate a new GEM object and fill it with the supplied data */
5960 struct drm_i915_gem_object *
5961 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5962                                  const void *data, size_t size)
5963 {
5964         struct drm_i915_gem_object *obj;
5965         struct file *file;
5966         size_t offset;
5967         int err;
5968
5969         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5970         if (IS_ERR(obj))
5971                 return obj;
5972
5973         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5974
5975         file = obj->base.filp;
5976         offset = 0;
5977         do {
5978                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5979                 struct page *page;
5980                 void *pgdata, *vaddr;
5981
5982                 err = pagecache_write_begin(file, file->f_mapping,
5983                                             offset, len, 0,
5984                                             &page, &pgdata);
5985                 if (err < 0)
5986                         goto fail;
5987
5988                 vaddr = kmap(page);
5989                 memcpy(vaddr, data, len);
5990                 kunmap(page);
5991
5992                 err = pagecache_write_end(file, file->f_mapping,
5993                                           offset, len, len,
5994                                           page, pgdata);
5995                 if (err < 0)
5996                         goto fail;
5997
5998                 size -= len;
5999                 data += len;
6000                 offset += len;
6001         } while (size);
6002
6003         return obj;
6004
6005 fail:
6006         i915_gem_object_put(obj);
6007         return ERR_PTR(err);
6008 }
6009
6010 struct scatterlist *
6011 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
6012                        unsigned int n,
6013                        unsigned int *offset)
6014 {
6015         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
6016         struct scatterlist *sg;
6017         unsigned int idx, count;
6018
6019         might_sleep();
6020         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
6021         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
6022
6023         /* As we iterate forward through the sg, we record each entry in a
6024          * radixtree for quick repeated (backwards) lookups. If we have seen
6025          * this index previously, we will have an entry for it.
6026          *
6027          * Initial lookup is O(N), but this is amortized to O(1) for
6028          * sequential page access (where each new request is consecutive
6029          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
6030          * i.e. O(1) with a large constant!
6031          */
6032         if (n < READ_ONCE(iter->sg_idx))
6033                 goto lookup;
6034
6035         mutex_lock(&iter->lock);
6036
6037         /* We prefer to reuse the last sg so that repeated lookup of this
6038          * (or the subsequent) sg are fast - comparing against the last
6039          * sg is faster than going through the radixtree.
6040          */
6041
6042         sg = iter->sg_pos;
6043         idx = iter->sg_idx;
6044         count = __sg_page_count(sg);
6045
6046         while (idx + count <= n) {
6047                 unsigned long exception, i;
6048                 int ret;
6049
6050                 /* If we cannot allocate and insert this entry, or the
6051                  * individual pages from this range, cancel updating the
6052                  * sg_idx so that on this lookup we are forced to linearly
6053                  * scan onwards, but on future lookups we will try the
6054                  * insertion again (in which case we need to be careful of
6055                  * the error return reporting that we have already inserted
6056                  * this index).
6057                  */
6058                 ret = radix_tree_insert(&iter->radix, idx, sg);
6059                 if (ret && ret != -EEXIST)
6060                         goto scan;
6061
6062                 exception =
6063                         RADIX_TREE_EXCEPTIONAL_ENTRY |
6064                         idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
6065                 for (i = 1; i < count; i++) {
6066                         ret = radix_tree_insert(&iter->radix, idx + i,
6067                                                 (void *)exception);
6068                         if (ret && ret != -EEXIST)
6069                                 goto scan;
6070                 }
6071
6072                 idx += count;
6073                 sg = ____sg_next(sg);
6074                 count = __sg_page_count(sg);
6075         }
6076
6077 scan:
6078         iter->sg_pos = sg;
6079         iter->sg_idx = idx;
6080
6081         mutex_unlock(&iter->lock);
6082
6083         if (unlikely(n < idx)) /* insertion completed by another thread */
6084                 goto lookup;
6085
6086         /* In case we failed to insert the entry into the radixtree, we need
6087          * to look beyond the current sg.
6088          */
6089         while (idx + count <= n) {
6090                 idx += count;
6091                 sg = ____sg_next(sg);
6092                 count = __sg_page_count(sg);
6093         }
6094
6095         *offset = n - idx;
6096         return sg;
6097
6098 lookup:
6099         rcu_read_lock();
6100
6101         sg = radix_tree_lookup(&iter->radix, n);
6102         GEM_BUG_ON(!sg);
6103
6104         /* If this index is in the middle of multi-page sg entry,
6105          * the radixtree will contain an exceptional entry that points
6106          * to the start of that range. We will return the pointer to
6107          * the base page and the offset of this page within the
6108          * sg entry's range.
6109          */
6110         *offset = 0;
6111         if (unlikely(radix_tree_exception(sg))) {
6112                 unsigned long base =
6113                         (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
6114
6115                 sg = radix_tree_lookup(&iter->radix, base);
6116                 GEM_BUG_ON(!sg);
6117
6118                 *offset = n - base;
6119         }
6120
6121         rcu_read_unlock();
6122
6123         return sg;
6124 }
6125
6126 struct page *
6127 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
6128 {
6129         struct scatterlist *sg;
6130         unsigned int offset;
6131
6132         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
6133
6134         sg = i915_gem_object_get_sg(obj, n, &offset);
6135         return nth_page(sg_page(sg), offset);
6136 }
6137
6138 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
6139 struct page *
6140 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
6141                                unsigned int n)
6142 {
6143         struct page *page;
6144
6145         page = i915_gem_object_get_page(obj, n);
6146         if (!obj->mm.dirty)
6147                 set_page_dirty(page);
6148
6149         return page;
6150 }
6151
6152 dma_addr_t
6153 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
6154                                 unsigned long n)
6155 {
6156         struct scatterlist *sg;
6157         unsigned int offset;
6158
6159         sg = i915_gem_object_get_sg(obj, n, &offset);
6160         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
6161 }
6162
6163 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
6164 {
6165         struct sg_table *pages;
6166         int err;
6167
6168         if (align > obj->base.size)
6169                 return -EINVAL;
6170
6171         if (obj->ops == &i915_gem_phys_ops)
6172                 return 0;
6173
6174         if (obj->ops != &i915_gem_object_ops)
6175                 return -EINVAL;
6176
6177         err = i915_gem_object_unbind(obj);
6178         if (err)
6179                 return err;
6180
6181         mutex_lock(&obj->mm.lock);
6182
6183         if (obj->mm.madv != I915_MADV_WILLNEED) {
6184                 err = -EFAULT;
6185                 goto err_unlock;
6186         }
6187
6188         if (obj->mm.quirked) {
6189                 err = -EFAULT;
6190                 goto err_unlock;
6191         }
6192
6193         if (obj->mm.mapping) {
6194                 err = -EBUSY;
6195                 goto err_unlock;
6196         }
6197
6198         pages = __i915_gem_object_unset_pages(obj);
6199
6200         obj->ops = &i915_gem_phys_ops;
6201
6202         err = ____i915_gem_object_get_pages(obj);
6203         if (err)
6204                 goto err_xfer;
6205
6206         /* Perma-pin (until release) the physical set of pages */
6207         __i915_gem_object_pin_pages(obj);
6208
6209         if (!IS_ERR_OR_NULL(pages))
6210                 i915_gem_object_ops.put_pages(obj, pages);
6211         mutex_unlock(&obj->mm.lock);
6212         return 0;
6213
6214 err_xfer:
6215         obj->ops = &i915_gem_object_ops;
6216         if (!IS_ERR_OR_NULL(pages)) {
6217                 unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
6218
6219                 __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
6220         }
6221 err_unlock:
6222         mutex_unlock(&obj->mm.lock);
6223         return err;
6224 }
6225
6226 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6227 #include "selftests/scatterlist.c"
6228 #include "selftests/mock_gem_device.c"
6229 #include "selftests/huge_gem_object.c"
6230 #include "selftests/huge_pages.c"
6231 #include "selftests/i915_gem_object.c"
6232 #include "selftests/i915_gem_coherency.c"
6233 #endif