Linux 4.19.133
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
28 #include <linux/dma-fence-array.h>
29 #include <linux/interval_tree_generic.h>
30 #include <linux/idr.h>
31 #include <drm/drmP.h>
32 #include <drm/amdgpu_drm.h>
33 #include "amdgpu.h"
34 #include "amdgpu_trace.h"
35 #include "amdgpu_amdkfd.h"
36 #include "amdgpu_gmc.h"
38 /**
39 * DOC: GPUVM
41 * GPUVM is similar to the legacy gart on older asics, however
42 * rather than there being a single global gart table
43 * for the entire GPU, there are multiple VM page tables active
44 * at any given time. The VM page tables can contain a mix of
45 * vram pages and system memory pages, and system memory pages
46 * can be mapped as snooped (cached system pages) or unsnooped
47 * (uncached system pages).
48 * Each VM has an ID associated with it and there is a page table
49 * associated with each VMID. When executing a command buffer,
50 * the kernel tells the ring what VMID to use for that command
51 * buffer. VMIDs are allocated dynamically as commands are submitted.
52 * The userspace drivers maintain their own address space and the kernel
53 * sets up their page tables accordingly when they submit their
54 * command buffers and a VMID is assigned.
55 * Cayman/Trinity support up to 8 active VMs at any given time;
56 * SI supports 16.
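/*
 * Illustrative sketch (not part of the driver): the per process page
 * table is walked one level at a time, using the shift that
 * amdgpu_vm_level_shift() returns for each level. This is roughly what
 * amdgpu_vm_get_entry() further down in this file does for a GPU
 * virtual address @addr:
 *
 *	uint64_t pfn = addr >> AMDGPU_GPU_PAGE_SHIFT;
 *	unsigned level = adev->vm_manager.root_level;
 *	struct amdgpu_vm_pt *entry = &vm->root;
 *
 *	while (entry->entries) {
 *		unsigned shift = amdgpu_vm_level_shift(adev, level++);
 *
 *		entry = &entry->entries[pfn >> shift];
 *		pfn &= (1ULL << shift) - 1;
 *	}
 */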
59 #define START(node) ((node)->start)
60 #define LAST(node) ((node)->last)
62 INTERVAL_TREE_DEFINE(struct amdgpu_bo_va_mapping, rb, uint64_t, __subtree_last,
63 START, LAST, static, amdgpu_vm_it)
65 #undef START
66 #undef LAST
68 /**
69 * struct amdgpu_pte_update_params - Local structure
71 * Encapsulate some VM table update parameters to reduce
72 * the number of function parameters
75 struct amdgpu_pte_update_params {
77 /**
78 * @adev: amdgpu device we do this update for
80 struct amdgpu_device *adev;
82 /**
83 * @vm: optional amdgpu_vm we do this update for
85 struct amdgpu_vm *vm;
87 /**
88 * @src: address where to copy page table entries from
90 uint64_t src;
92 /**
93 * @ib: indirect buffer to fill with commands
95 struct amdgpu_ib *ib;
97 /**
98 * @func: Function which actually does the update
100 void (*func)(struct amdgpu_pte_update_params *params,
101 struct amdgpu_bo *bo, uint64_t pe,
102 uint64_t addr, unsigned count, uint32_t incr,
103 uint64_t flags);
105 * @pages_addr:
107 * DMA addresses to use for mapping, used during VM update by CPU
109 dma_addr_t *pages_addr;
112 * @kptr:
114 * Kernel pointer of PD/PT BO that needs to be updated,
115 * used during VM update by CPU
117 void *kptr;
121 * struct amdgpu_prt_cb - Helper to disable partial resident texture feature from a fence callback
123 struct amdgpu_prt_cb {
126 * @adev: amdgpu device
128 struct amdgpu_device *adev;
131 * @cb: callback
133 struct dma_fence_cb cb;
137 * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
139 * @base: base structure for tracking BO usage in a VM
140 * @vm: vm to which bo is to be added
141 * @bo: amdgpu buffer object
143 * Initialize a bo_va_base structure and add it to the appropriate lists
146 static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
147 struct amdgpu_vm *vm,
148 struct amdgpu_bo *bo)
150 base->vm = vm;
151 base->bo = bo;
152 INIT_LIST_HEAD(&base->bo_list);
153 INIT_LIST_HEAD(&base->vm_status);
155 if (!bo)
156 return;
157 list_add_tail(&base->bo_list, &bo->va);
159 if (bo->tbo.type == ttm_bo_type_kernel)
160 list_move(&base->vm_status, &vm->relocated);
162 if (bo->tbo.resv != vm->root.base.bo->tbo.resv)
163 return;
165 if (bo->preferred_domains &
166 amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type))
167 return;
170 * We checked all the prerequisites, but it looks like this per VM BO
171 * is currently evicted. Add the BO to the evicted list to make sure it
172 * is validated on next VM use to avoid faults.
173 */
174 list_move_tail(&base->vm_status, &vm->evicted);
175 base->moved = true;
179 * amdgpu_vm_level_shift - return the addr shift for each level
181 * @adev: amdgpu_device pointer
182 * @level: VMPT level
184 * Returns:
185 * The number of bits the pfn needs to be right shifted for a level.
187 static unsigned amdgpu_vm_level_shift(struct amdgpu_device *adev,
188 unsigned level)
190 unsigned shift = 0xff;
192 switch (level) {
193 case AMDGPU_VM_PDB2:
194 case AMDGPU_VM_PDB1:
195 case AMDGPU_VM_PDB0:
196 shift = 9 * (AMDGPU_VM_PDB0 - level) +
197 adev->vm_manager.block_size;
198 break;
199 case AMDGPU_VM_PTB:
200 shift = 0;
201 break;
202 default:
203 dev_err(adev->dev, "the level%d isn't supported.\n", level);
206 return shift;
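/*
 * Worked example (assuming the common 9 bit block size): the shifts are
 * PTB -> 0, PDB0 -> 9, PDB1 -> 18 and PDB2 -> 27. Together with the 12
 * bit GPU page shift, one PDB0 entry covers 2MiB, one PDB1 entry 1GiB
 * and one PDB2 entry 512GiB of address space.
 */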
210 * amdgpu_vm_num_entries - return the number of entries in a PD/PT
212 * @adev: amdgpu_device pointer
213 * @level: VMPT level
215 * Returns:
216 * The number of entries in a page directory or page table.
218 static unsigned amdgpu_vm_num_entries(struct amdgpu_device *adev,
219 unsigned level)
221 unsigned shift = amdgpu_vm_level_shift(adev,
222 adev->vm_manager.root_level);
224 if (level == adev->vm_manager.root_level)
225 /* For the root directory */
226 return round_up(adev->vm_manager.max_pfn, 1 << shift) >> shift;
227 else if (level != AMDGPU_VM_PTB)
228 /* Everything in between */
229 return 512;
230 else
231 /* For the page tables on the leaves */
232 return AMDGPU_VM_PTE_COUNT(adev);
236 * amdgpu_vm_bo_size - returns the size of the BOs in bytes
238 * @adev: amdgpu_device pointer
239 * @level: VMPT level
241 * Returns:
242 * The size of the BO for a page directory or page table in bytes.
244 static unsigned amdgpu_vm_bo_size(struct amdgpu_device *adev, unsigned level)
246 return AMDGPU_GPU_PAGE_ALIGN(amdgpu_vm_num_entries(adev, level) * 8);
250 * amdgpu_vm_get_pd_bo - add the VM PD to a validation list
252 * @vm: vm providing the BOs
253 * @validated: head of validation list
254 * @entry: entry to add
256 * Add the page directory to the list of BOs to
257 * validate for command submission.
259 void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
260 struct list_head *validated,
261 struct amdgpu_bo_list_entry *entry)
263 entry->robj = vm->root.base.bo;
264 entry->priority = 0;
265 entry->tv.bo = &entry->robj->tbo;
266 entry->tv.shared = true;
267 entry->user_pages = NULL;
268 list_add(&entry->tv.head, validated);
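/*
 * Usage sketch (illustrative only; the real callers in amdgpu_cs.c and
 * amdgpu_gem.c do more setup): the PD is added to the validation list
 * before all buffers are reserved for command submission:
 *
 *	struct amdgpu_bo_list_entry vm_pd;
 *	struct list_head list;
 *
 *	INIT_LIST_HEAD(&list);
 *	amdgpu_vm_get_pd_bo(vm, &list, &vm_pd);
 *	// followed by ttm_eu_reserve_buffers() on &list
 */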
272 * amdgpu_vm_validate_pt_bos - validate the page table BOs
274 * @adev: amdgpu device pointer
275 * @vm: vm providing the BOs
276 * @validate: callback to do the validation
277 * @param: parameter for the validation callback
279 * Validate the page table BOs on command submission if necessary.
281 * Returns:
282 * Validation result.
284 int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
285 int (*validate)(void *p, struct amdgpu_bo *bo),
286 void *param)
288 struct ttm_bo_global *glob = adev->mman.bdev.glob;
289 struct amdgpu_vm_bo_base *bo_base, *tmp;
290 int r = 0;
292 list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) {
293 struct amdgpu_bo *bo = bo_base->bo;
295 if (bo->parent) {
296 r = validate(param, bo);
297 if (r)
298 break;
300 spin_lock(&glob->lru_lock);
301 ttm_bo_move_to_lru_tail(&bo->tbo);
302 if (bo->shadow)
303 ttm_bo_move_to_lru_tail(&bo->shadow->tbo);
304 spin_unlock(&glob->lru_lock);
307 if (bo->tbo.type != ttm_bo_type_kernel) {
308 spin_lock(&vm->moved_lock);
309 list_move(&bo_base->vm_status, &vm->moved);
310 spin_unlock(&vm->moved_lock);
311 } else {
312 list_move(&bo_base->vm_status, &vm->relocated);
316 spin_lock(&glob->lru_lock);
317 list_for_each_entry(bo_base, &vm->idle, vm_status) {
318 struct amdgpu_bo *bo = bo_base->bo;
320 if (!bo->parent)
321 continue;
323 ttm_bo_move_to_lru_tail(&bo->tbo);
324 if (bo->shadow)
325 ttm_bo_move_to_lru_tail(&bo->shadow->tbo);
327 spin_unlock(&glob->lru_lock);
329 return r;
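/*
 * Minimal sketch of a @validate callback (illustrative only; the real
 * callbacks in amdgpu_cs.c/amdgpu_gem.c also pick a placement first):
 *
 *	static int example_validate(void *param, struct amdgpu_bo *bo)
 *	{
 *		struct ttm_operation_ctx ctx = { true, false };
 *
 *		return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
 *	}
 */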
333 * amdgpu_vm_ready - check VM is ready for updates
335 * @vm: VM to check
337 * Check if all VM PDs/PTs are ready for updates
339 * Returns:
340 * True if eviction list is empty.
342 bool amdgpu_vm_ready(struct amdgpu_vm *vm)
344 return list_empty(&vm->evicted);
348 * amdgpu_vm_clear_bo - initially clear the PDs/PTs
350 * @adev: amdgpu_device pointer
351 * @vm: VM to clear BO from
352 * @bo: BO to clear
353 * @level: level this BO is at
354 * @pte_support_ats: indicate ATS support from PTE
356 * Root PD needs to be reserved when calling this.
358 * Returns:
359 * 0 on success, errno otherwise.
361 static int amdgpu_vm_clear_bo(struct amdgpu_device *adev,
362 struct amdgpu_vm *vm, struct amdgpu_bo *bo,
363 unsigned level, bool pte_support_ats)
365 struct ttm_operation_ctx ctx = { true, false };
366 struct dma_fence *fence = NULL;
367 unsigned entries, ats_entries;
368 struct amdgpu_ring *ring;
369 struct amdgpu_job *job;
370 uint64_t addr;
371 int r;
373 entries = amdgpu_bo_size(bo) / 8;
375 if (pte_support_ats) {
376 if (level == adev->vm_manager.root_level) {
377 ats_entries = amdgpu_vm_level_shift(adev, level);
378 ats_entries += AMDGPU_GPU_PAGE_SHIFT;
379 ats_entries = AMDGPU_VA_HOLE_START >> ats_entries;
380 ats_entries = min(ats_entries, entries);
381 entries -= ats_entries;
382 } else {
383 ats_entries = entries;
384 entries = 0;
386 } else {
387 ats_entries = 0;
390 ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, sched);
392 r = reservation_object_reserve_shared(bo->tbo.resv);
393 if (r)
394 return r;
396 r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
397 if (r)
398 goto error;
400 r = amdgpu_job_alloc_with_ib(adev, 64, &job);
401 if (r)
402 goto error;
404 addr = amdgpu_bo_gpu_offset(bo);
405 if (ats_entries) {
406 uint64_t ats_value;
408 ats_value = AMDGPU_PTE_DEFAULT_ATC;
409 if (level != AMDGPU_VM_PTB)
410 ats_value |= AMDGPU_PDE_PTE;
412 amdgpu_vm_set_pte_pde(adev, &job->ibs[0], addr, 0,
413 ats_entries, 0, ats_value);
414 addr += ats_entries * 8;
417 if (entries)
418 amdgpu_vm_set_pte_pde(adev, &job->ibs[0], addr, 0,
419 entries, 0, 0);
421 amdgpu_ring_pad_ib(ring, &job->ibs[0]);
423 WARN_ON(job->ibs[0].length_dw > 64);
424 r = amdgpu_sync_resv(adev, &job->sync, bo->tbo.resv,
425 AMDGPU_FENCE_OWNER_UNDEFINED, false);
426 if (r)
427 goto error_free;
429 r = amdgpu_job_submit(job, &vm->entity, AMDGPU_FENCE_OWNER_UNDEFINED,
430 &fence);
431 if (r)
432 goto error_free;
434 amdgpu_bo_fence(bo, fence, true);
435 dma_fence_put(fence);
437 if (bo->shadow)
438 return amdgpu_vm_clear_bo(adev, vm, bo->shadow,
439 level, pte_support_ats);
441 return 0;
443 error_free:
444 amdgpu_job_free(job);
446 error:
447 return r;
451 * amdgpu_vm_alloc_levels - allocate the PD/PT levels
453 * @adev: amdgpu_device pointer
454 * @vm: requested vm
455 * @parent: parent PT
456 * @saddr: start of the address range
457 * @eaddr: end of the address range
458 * @level: VMPT level
459 * @ats: indicate ATS support from PTE
461 * Make sure the page directories and page tables are allocated
463 * Returns:
464 * 0 on success, errno otherwise.
466 static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev,
467 struct amdgpu_vm *vm,
468 struct amdgpu_vm_pt *parent,
469 uint64_t saddr, uint64_t eaddr,
470 unsigned level, bool ats)
472 unsigned shift = amdgpu_vm_level_shift(adev, level);
473 unsigned pt_idx, from, to;
474 u64 flags;
475 int r;
477 if (!parent->entries) {
478 unsigned num_entries = amdgpu_vm_num_entries(adev, level);
480 parent->entries = kvmalloc_array(num_entries,
481 sizeof(struct amdgpu_vm_pt),
482 GFP_KERNEL | __GFP_ZERO);
483 if (!parent->entries)
484 return -ENOMEM;
485 memset(parent->entries, 0 , sizeof(struct amdgpu_vm_pt));
488 from = saddr >> shift;
489 to = eaddr >> shift;
490 if (from >= amdgpu_vm_num_entries(adev, level) ||
491 to >= amdgpu_vm_num_entries(adev, level))
492 return -EINVAL;
494 ++level;
495 saddr = saddr & ((1 << shift) - 1);
496 eaddr = eaddr & ((1 << shift) - 1);
498 flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
499 if (vm->root.base.bo->shadow)
500 flags |= AMDGPU_GEM_CREATE_SHADOW;
501 if (vm->use_cpu_for_update)
502 flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
503 else
504 flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
506 /* walk over the address space and allocate the page tables */
507 for (pt_idx = from; pt_idx <= to; ++pt_idx) {
508 struct reservation_object *resv = vm->root.base.bo->tbo.resv;
509 struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
510 struct amdgpu_bo *pt;
512 if (!entry->base.bo) {
513 struct amdgpu_bo_param bp;
515 memset(&bp, 0, sizeof(bp));
516 bp.size = amdgpu_vm_bo_size(adev, level);
517 bp.byte_align = AMDGPU_GPU_PAGE_SIZE;
518 bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
519 bp.flags = flags;
520 bp.type = ttm_bo_type_kernel;
521 bp.resv = resv;
522 r = amdgpu_bo_create(adev, &bp, &pt);
523 if (r)
524 return r;
526 r = amdgpu_vm_clear_bo(adev, vm, pt, level, ats);
527 if (r) {
528 amdgpu_bo_unref(&pt->shadow);
529 amdgpu_bo_unref(&pt);
530 return r;
533 if (vm->use_cpu_for_update) {
534 r = amdgpu_bo_kmap(pt, NULL);
535 if (r) {
536 amdgpu_bo_unref(&pt->shadow);
537 amdgpu_bo_unref(&pt);
538 return r;
542 /* Keep a reference to the root directory to avoid
543 * freeing them up in the wrong order.
545 pt->parent = amdgpu_bo_ref(parent->base.bo);
547 amdgpu_vm_bo_base_init(&entry->base, vm, pt);
550 if (level < AMDGPU_VM_PTB) {
551 uint64_t sub_saddr = (pt_idx == from) ? saddr : 0;
552 uint64_t sub_eaddr = (pt_idx == to) ? eaddr :
553 ((1 << shift) - 1);
554 r = amdgpu_vm_alloc_levels(adev, vm, entry, sub_saddr,
555 sub_eaddr, level, ats);
556 if (r)
557 return r;
561 return 0;
565 * amdgpu_vm_alloc_pts - Allocate page tables.
567 * @adev: amdgpu_device pointer
568 * @vm: VM to allocate page tables for
569 * @saddr: Start address which needs to be allocated
570 * @size: Size from start address we need.
572 * Make sure the page tables are allocated.
574 * Returns:
575 * 0 on success, errno otherwise.
577 int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
578 struct amdgpu_vm *vm,
579 uint64_t saddr, uint64_t size)
581 uint64_t eaddr;
582 bool ats = false;
584 /* validate the parameters */
585 if (saddr & AMDGPU_GPU_PAGE_MASK || size & AMDGPU_GPU_PAGE_MASK)
586 return -EINVAL;
588 eaddr = saddr + size - 1;
590 if (vm->pte_support_ats)
591 ats = saddr < AMDGPU_VA_HOLE_START;
593 saddr /= AMDGPU_GPU_PAGE_SIZE;
594 eaddr /= AMDGPU_GPU_PAGE_SIZE;
596 if (eaddr >= adev->vm_manager.max_pfn) {
597 dev_err(adev->dev, "va above limit (0x%08llX >= 0x%08llX)\n",
598 eaddr, adev->vm_manager.max_pfn);
599 return -EINVAL;
602 return amdgpu_vm_alloc_levels(adev, vm, &vm->root, saddr, eaddr,
603 adev->vm_manager.root_level, ats);
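/*
 * Usage sketch (illustrative): callers pass a GPU page aligned byte
 * range, e.g. for a new mapping whose start/last are in GPU pages:
 *
 *	r = amdgpu_vm_alloc_pts(adev, vm,
 *				mapping->start * AMDGPU_GPU_PAGE_SIZE,
 *				(mapping->last + 1 - mapping->start) *
 *				AMDGPU_GPU_PAGE_SIZE);
 */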
607 * amdgpu_vm_check_compute_bug - check whether asic has compute vm bug
609 * @adev: amdgpu_device pointer
611 void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev)
613 const struct amdgpu_ip_block *ip_block;
614 bool has_compute_vm_bug;
615 struct amdgpu_ring *ring;
616 int i;
618 has_compute_vm_bug = false;
620 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
621 if (ip_block) {
622 /* Compute has a VM bug for GFX version < 7.
623 Compute has a VM bug for GFX 8 MEC firmware version < 673.*/
624 if (ip_block->version->major <= 7)
625 has_compute_vm_bug = true;
626 else if (ip_block->version->major == 8)
627 if (adev->gfx.mec_fw_version < 673)
628 has_compute_vm_bug = true;
631 for (i = 0; i < adev->num_rings; i++) {
632 ring = adev->rings[i];
633 if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
634 /* only compute rings */
635 ring->has_compute_vm_bug = has_compute_vm_bug;
636 else
637 ring->has_compute_vm_bug = false;
642 * amdgpu_vm_need_pipeline_sync - Check if pipe sync is needed for job.
644 * @ring: ring on which the job will be submitted
645 * @job: job to submit
647 * Returns:
648 * True if sync is needed.
650 bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
651 struct amdgpu_job *job)
653 struct amdgpu_device *adev = ring->adev;
654 unsigned vmhub = ring->funcs->vmhub;
655 struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
656 struct amdgpu_vmid *id;
657 bool gds_switch_needed;
658 bool vm_flush_needed = job->vm_needs_flush || ring->has_compute_vm_bug;
660 if (job->vmid == 0)
661 return false;
662 id = &id_mgr->ids[job->vmid];
663 gds_switch_needed = ring->funcs->emit_gds_switch && (
664 id->gds_base != job->gds_base ||
665 id->gds_size != job->gds_size ||
666 id->gws_base != job->gws_base ||
667 id->gws_size != job->gws_size ||
668 id->oa_base != job->oa_base ||
669 id->oa_size != job->oa_size);
671 if (amdgpu_vmid_had_gpu_reset(adev, id))
672 return true;
674 return vm_flush_needed || gds_switch_needed;
678 * amdgpu_vm_flush - hardware flush the vm
680 * @ring: ring to use for flush
681 * @job: related job
682 * @need_pipe_sync: is pipe sync needed
684 * Emit a VM flush when it is necessary.
686 * Returns:
687 * 0 on success, errno otherwise.
689 int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync)
691 struct amdgpu_device *adev = ring->adev;
692 unsigned vmhub = ring->funcs->vmhub;
693 struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
694 struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
695 bool gds_switch_needed = ring->funcs->emit_gds_switch && (
696 id->gds_base != job->gds_base ||
697 id->gds_size != job->gds_size ||
698 id->gws_base != job->gws_base ||
699 id->gws_size != job->gws_size ||
700 id->oa_base != job->oa_base ||
701 id->oa_size != job->oa_size);
702 bool vm_flush_needed = job->vm_needs_flush;
703 struct dma_fence *fence = NULL;
704 bool pasid_mapping_needed = false;
705 unsigned patch_offset = 0;
706 int r;
708 if (amdgpu_vmid_had_gpu_reset(adev, id)) {
709 gds_switch_needed = true;
710 vm_flush_needed = true;
711 pasid_mapping_needed = true;
714 mutex_lock(&id_mgr->lock);
715 if (id->pasid != job->pasid || !id->pasid_mapping ||
716 !dma_fence_is_signaled(id->pasid_mapping))
717 pasid_mapping_needed = true;
718 mutex_unlock(&id_mgr->lock);
720 gds_switch_needed &= !!ring->funcs->emit_gds_switch;
721 vm_flush_needed &= !!ring->funcs->emit_vm_flush &&
722 job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
723 pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
724 ring->funcs->emit_wreg;
726 if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
727 return 0;
729 if (ring->funcs->init_cond_exec)
730 patch_offset = amdgpu_ring_init_cond_exec(ring);
732 if (need_pipe_sync)
733 amdgpu_ring_emit_pipeline_sync(ring);
735 if (vm_flush_needed) {
736 trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
737 amdgpu_ring_emit_vm_flush(ring, job->vmid, job->vm_pd_addr);
740 if (pasid_mapping_needed)
741 amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
743 if (vm_flush_needed || pasid_mapping_needed) {
744 r = amdgpu_fence_emit(ring, &fence, 0);
745 if (r)
746 return r;
749 if (vm_flush_needed) {
750 mutex_lock(&id_mgr->lock);
751 dma_fence_put(id->last_flush);
752 id->last_flush = dma_fence_get(fence);
753 id->current_gpu_reset_count =
754 atomic_read(&adev->gpu_reset_counter);
755 mutex_unlock(&id_mgr->lock);
758 if (pasid_mapping_needed) {
759 mutex_lock(&id_mgr->lock);
760 id->pasid = job->pasid;
761 dma_fence_put(id->pasid_mapping);
762 id->pasid_mapping = dma_fence_get(fence);
763 mutex_unlock(&id_mgr->lock);
765 dma_fence_put(fence);
767 if (ring->funcs->emit_gds_switch && gds_switch_needed) {
768 id->gds_base = job->gds_base;
769 id->gds_size = job->gds_size;
770 id->gws_base = job->gws_base;
771 id->gws_size = job->gws_size;
772 id->oa_base = job->oa_base;
773 id->oa_size = job->oa_size;
774 amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base,
775 job->gds_size, job->gws_base,
776 job->gws_size, job->oa_base,
777 job->oa_size);
780 if (ring->funcs->patch_cond_exec)
781 amdgpu_ring_patch_cond_exec(ring, patch_offset);
783 /* the double SWITCH_BUFFER here *cannot* be skipped by COND_EXEC */
784 if (ring->funcs->emit_switch_buffer) {
785 amdgpu_ring_emit_switch_buffer(ring);
786 amdgpu_ring_emit_switch_buffer(ring);
788 return 0;
792 * amdgpu_vm_bo_find - find the bo_va for a specific vm & bo
794 * @vm: requested vm
795 * @bo: requested buffer object
797 * Find @bo inside the requested vm.
798 * Search inside the @bo's vm list for the requested vm.
799 * Returns the found bo_va or NULL if none is found
801 * Object has to be reserved!
803 * Returns:
804 * Found bo_va or NULL.
806 struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm,
807 struct amdgpu_bo *bo)
809 struct amdgpu_bo_va *bo_va;
811 list_for_each_entry(bo_va, &bo->va, base.bo_list) {
812 if (bo_va->base.vm == vm) {
813 return bo_va;
816 return NULL;
820 * amdgpu_vm_do_set_ptes - helper to call the right asic function
822 * @params: see amdgpu_pte_update_params definition
823 * @bo: PD/PT to update
824 * @pe: addr of the page entry
825 * @addr: dst addr to write into pe
826 * @count: number of page entries to update
827 * @incr: increase next addr by incr bytes
828 * @flags: hw access flags
830 * Traces the parameters and calls the right asic functions
831 * to setup the page table using the DMA.
833 static void amdgpu_vm_do_set_ptes(struct amdgpu_pte_update_params *params,
834 struct amdgpu_bo *bo,
835 uint64_t pe, uint64_t addr,
836 unsigned count, uint32_t incr,
837 uint64_t flags)
839 pe += amdgpu_bo_gpu_offset(bo);
840 trace_amdgpu_vm_set_ptes(pe, addr, count, incr, flags);
842 if (count < 3) {
843 amdgpu_vm_write_pte(params->adev, params->ib, pe,
844 addr | flags, count, incr);
846 } else {
847 amdgpu_vm_set_pte_pde(params->adev, params->ib, pe, addr,
848 count, incr, flags);
853 * amdgpu_vm_do_copy_ptes - copy the PTEs from the GART
855 * @params: see amdgpu_pte_update_params definition
856 * @bo: PD/PT to update
857 * @pe: addr of the page entry
858 * @addr: dst addr to write into pe
859 * @count: number of page entries to update
860 * @incr: increase next addr by incr bytes
861 * @flags: hw access flags
863 * Traces the parameters and calls the DMA function to copy the PTEs.
865 static void amdgpu_vm_do_copy_ptes(struct amdgpu_pte_update_params *params,
866 struct amdgpu_bo *bo,
867 uint64_t pe, uint64_t addr,
868 unsigned count, uint32_t incr,
869 uint64_t flags)
871 uint64_t src = (params->src + (addr >> 12) * 8);
873 pe += amdgpu_bo_gpu_offset(bo);
874 trace_amdgpu_vm_copy_ptes(pe, src, count);
876 amdgpu_vm_copy_pte(params->adev, params->ib, pe, src, count);
880 * amdgpu_vm_map_gart - Resolve gart mapping of addr
882 * @pages_addr: optional DMA address to use for lookup
883 * @addr: the unmapped addr
885 * Look up the physical address of the page that the pte resolves
886 * to.
888 * Returns:
889 * The pointer for the page table entry.
891 static uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr)
893 uint64_t result;
895 /* page table offset */
896 result = pages_addr[addr >> PAGE_SHIFT];
898 /* in case cpu page size != gpu page size */
899 result |= addr & (~PAGE_MASK);
901 result &= 0xFFFFFFFFFFFFF000ULL;
903 return result;
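/*
 * Worked example: with 64KiB CPU pages (PAGE_SHIFT == 16) the entry
 * pages_addr[addr >> 16] is the DMA address of the whole CPU page,
 * bits 12..15 of addr (kept by the OR, preserved by the final mask)
 * select the 4KiB GPU page inside it, and the sub-4KiB offset is
 * dropped. With 4KiB CPU pages the OR and mask are effectively no-ops.
 */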
907 * amdgpu_vm_cpu_set_ptes - helper to update page tables via CPU
909 * @params: see amdgpu_pte_update_params definition
910 * @bo: PD/PT to update
911 * @pe: kmap addr of the page entry
912 * @addr: dst addr to write into pe
913 * @count: number of page entries to update
914 * @incr: increase next addr by incr bytes
915 * @flags: hw access flags
917 * Write count number of PT/PD entries directly.
919 static void amdgpu_vm_cpu_set_ptes(struct amdgpu_pte_update_params *params,
920 struct amdgpu_bo *bo,
921 uint64_t pe, uint64_t addr,
922 unsigned count, uint32_t incr,
923 uint64_t flags)
925 unsigned int i;
926 uint64_t value;
928 pe += (unsigned long)amdgpu_bo_kptr(bo);
930 trace_amdgpu_vm_set_ptes(pe, addr, count, incr, flags);
932 for (i = 0; i < count; i++) {
933 value = params->pages_addr ?
934 amdgpu_vm_map_gart(params->pages_addr, addr) :
935 addr;
936 amdgpu_gmc_set_pte_pde(params->adev, (void *)(uintptr_t)pe,
937 i, value, flags);
938 addr += incr;
944 * amdgpu_vm_wait_pd - Wait for PT BOs to be free.
946 * @adev: amdgpu_device pointer
947 * @vm: related vm
948 * @owner: fence owner
950 * Returns:
951 * 0 on success, errno otherwise.
953 static int amdgpu_vm_wait_pd(struct amdgpu_device *adev, struct amdgpu_vm *vm,
954 void *owner)
956 struct amdgpu_sync sync;
957 int r;
959 amdgpu_sync_create(&sync);
960 amdgpu_sync_resv(adev, &sync, vm->root.base.bo->tbo.resv, owner, false);
961 r = amdgpu_sync_wait(&sync, true);
962 amdgpu_sync_free(&sync);
964 return r;
968 * amdgpu_vm_update_pde - update a single level in the hierarchy
970 * @param: parameters for the update
971 * @vm: requested vm
972 * @parent: parent directory
973 * @entry: entry to update
975 * Makes sure the requested entry in parent is up to date.
977 static void amdgpu_vm_update_pde(struct amdgpu_pte_update_params *params,
978 struct amdgpu_vm *vm,
979 struct amdgpu_vm_pt *parent,
980 struct amdgpu_vm_pt *entry)
982 struct amdgpu_bo *bo = parent->base.bo, *pbo;
983 uint64_t pde, pt, flags;
984 unsigned level;
986 /* Don't update huge pages here */
987 if (entry->huge)
988 return;
990 for (level = 0, pbo = bo->parent; pbo; ++level)
991 pbo = pbo->parent;
993 level += params->adev->vm_manager.root_level;
994 pt = amdgpu_bo_gpu_offset(entry->base.bo);
995 flags = AMDGPU_PTE_VALID;
996 amdgpu_gmc_get_vm_pde(params->adev, level, &pt, &flags);
997 pde = (entry - parent->entries) * 8;
998 if (bo->shadow)
999 params->func(params, bo->shadow, pde, pt, 1, 0, flags);
1000 params->func(params, bo, pde, pt, 1, 0, flags);
1004 * amdgpu_vm_invalidate_level - mark all PD levels as invalid
1006 * @adev: amdgpu_device pointer
1007 * @vm: related vm
1008 * @parent: parent PD
1009 * @level: VMPT level
1011 * Mark all PD levels as invalid after an error.
1013 static void amdgpu_vm_invalidate_level(struct amdgpu_device *adev,
1014 struct amdgpu_vm *vm,
1015 struct amdgpu_vm_pt *parent,
1016 unsigned level)
1018 unsigned pt_idx, num_entries;
1021 * Recurse into the subdirectories. This recursion is harmless because
1022 * we only have a maximum of 5 layers.
1024 num_entries = amdgpu_vm_num_entries(adev, level);
1025 for (pt_idx = 0; pt_idx < num_entries; ++pt_idx) {
1026 struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
1028 if (!entry->base.bo)
1029 continue;
1031 if (!entry->base.moved)
1032 list_move(&entry->base.vm_status, &vm->relocated);
1033 amdgpu_vm_invalidate_level(adev, vm, entry, level + 1);
1038 * amdgpu_vm_update_directories - make sure that all directories are valid
1040 * @adev: amdgpu_device pointer
1041 * @vm: requested vm
1043 * Makes sure all directories are up to date.
1045 * Returns:
1046 * 0 for success, error for failure.
1048 int amdgpu_vm_update_directories(struct amdgpu_device *adev,
1049 struct amdgpu_vm *vm)
1051 struct amdgpu_pte_update_params params;
1052 struct amdgpu_job *job;
1053 unsigned ndw = 0;
1054 int r = 0;
1056 if (list_empty(&vm->relocated))
1057 return 0;
1059 restart:
1060 memset(&params, 0, sizeof(params));
1061 params.adev = adev;
1063 if (vm->use_cpu_for_update) {
1064 struct amdgpu_vm_bo_base *bo_base;
1066 list_for_each_entry(bo_base, &vm->relocated, vm_status) {
1067 r = amdgpu_bo_kmap(bo_base->bo, NULL);
1068 if (unlikely(r))
1069 return r;
1072 r = amdgpu_vm_wait_pd(adev, vm, AMDGPU_FENCE_OWNER_VM);
1073 if (unlikely(r))
1074 return r;
1076 params.func = amdgpu_vm_cpu_set_ptes;
1077 } else {
1078 ndw = 512 * 8;
1079 r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
1080 if (r)
1081 return r;
1083 params.ib = &job->ibs[0];
1084 params.func = amdgpu_vm_do_set_ptes;
1087 while (!list_empty(&vm->relocated)) {
1088 struct amdgpu_vm_bo_base *bo_base, *parent;
1089 struct amdgpu_vm_pt *pt, *entry;
1090 struct amdgpu_bo *bo;
1092 bo_base = list_first_entry(&vm->relocated,
1093 struct amdgpu_vm_bo_base,
1094 vm_status);
1095 bo_base->moved = false;
1096 list_del_init(&bo_base->vm_status);
1098 bo = bo_base->bo->parent;
1099 if (!bo)
1100 continue;
1102 parent = list_first_entry(&bo->va, struct amdgpu_vm_bo_base,
1103 bo_list);
1104 pt = container_of(parent, struct amdgpu_vm_pt, base);
1105 entry = container_of(bo_base, struct amdgpu_vm_pt, base);
1107 amdgpu_vm_update_pde(&params, vm, pt, entry);
1109 if (!vm->use_cpu_for_update &&
1110 (ndw - params.ib->length_dw) < 32)
1111 break;
1114 if (vm->use_cpu_for_update) {
1115 /* Flush HDP */
1116 mb();
1117 amdgpu_asic_flush_hdp(adev, NULL);
1118 } else if (params.ib->length_dw == 0) {
1119 amdgpu_job_free(job);
1120 } else {
1121 struct amdgpu_bo *root = vm->root.base.bo;
1122 struct amdgpu_ring *ring;
1123 struct dma_fence *fence;
1125 ring = container_of(vm->entity.rq->sched, struct amdgpu_ring,
1126 sched);
1128 amdgpu_ring_pad_ib(ring, params.ib);
1129 amdgpu_sync_resv(adev, &job->sync, root->tbo.resv,
1130 AMDGPU_FENCE_OWNER_VM, false);
1131 WARN_ON(params.ib->length_dw > ndw);
1132 r = amdgpu_job_submit(job, &vm->entity, AMDGPU_FENCE_OWNER_VM,
1133 &fence);
1134 if (r)
1135 goto error;
1137 amdgpu_bo_fence(root, fence, true);
1138 dma_fence_put(vm->last_update);
1139 vm->last_update = fence;
1142 if (!list_empty(&vm->relocated))
1143 goto restart;
1145 return 0;
1147 error:
1148 amdgpu_vm_invalidate_level(adev, vm, &vm->root,
1149 adev->vm_manager.root_level);
1150 amdgpu_job_free(job);
1151 return r;
1155 * amdgpu_vm_get_entry - find the entry for an address
1157 * @p: see amdgpu_pte_update_params definition
1158 * @addr: virtual address in question
1159 * @entry: resulting entry or NULL
1160 * @parent: parent entry
1162 * Find the vm_pt entry and its parent for the given address.
1164 void amdgpu_vm_get_entry(struct amdgpu_pte_update_params *p, uint64_t addr,
1165 struct amdgpu_vm_pt **entry,
1166 struct amdgpu_vm_pt **parent)
1168 unsigned level = p->adev->vm_manager.root_level;
1170 *parent = NULL;
1171 *entry = &p->vm->root;
1172 while ((*entry)->entries) {
1173 unsigned shift = amdgpu_vm_level_shift(p->adev, level++);
1175 *parent = *entry;
1176 *entry = &(*entry)->entries[addr >> shift];
1177 addr &= (1ULL << shift) - 1;
1180 if (level != AMDGPU_VM_PTB)
1181 *entry = NULL;
1185 * amdgpu_vm_handle_huge_pages - handle updating the PD with huge pages
1187 * @p: see amdgpu_pte_update_params definition
1188 * @entry: vm_pt entry to check
1189 * @parent: parent entry
1190 * @nptes: number of PTEs updated with this operation
1191 * @dst: destination address where the PTEs should point to
1192 * @flags: access flags for the PTEs
1194 * Check if we can update the PD with a huge page.
1196 static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
1197 struct amdgpu_vm_pt *entry,
1198 struct amdgpu_vm_pt *parent,
1199 unsigned nptes, uint64_t dst,
1200 uint64_t flags)
1202 uint64_t pde;
1204 /* In the case of a mixed PT the PDE must point to it */
1205 if (p->adev->asic_type >= CHIP_VEGA10 && !p->src &&
1206 nptes == AMDGPU_VM_PTE_COUNT(p->adev)) {
1207 /* Set the huge page flag to stop scanning at this PDE */
1208 flags |= AMDGPU_PDE_PTE;
1211 if (!(flags & AMDGPU_PDE_PTE)) {
1212 if (entry->huge) {
1213 /* Add the entry to the relocated list to update it. */
1214 entry->huge = false;
1215 list_move(&entry->base.vm_status, &p->vm->relocated);
1217 return;
1220 entry->huge = true;
1221 amdgpu_gmc_get_vm_pde(p->adev, AMDGPU_VM_PDB0, &dst, &flags);
1223 pde = (entry - parent->entries) * 8;
1224 if (parent->base.bo->shadow)
1225 p->func(p, parent->base.bo->shadow, pde, dst, 1, 0, flags);
1226 p->func(p, parent->base.bo, pde, dst, 1, 0, flags);
1230 * amdgpu_vm_update_ptes - make sure that page tables are valid
1232 * @params: see amdgpu_pte_update_params definition
1233 * @start: start of GPU address range
1234 * @end: end of GPU address range
1235 * @dst: destination address to map to, the next dst inside the function
1236 * @flags: mapping flags
1238 * Update the page tables in the range @start - @end.
1240 * Returns:
1241 * 0 for success, -EINVAL for failure.
1243 static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
1244 uint64_t start, uint64_t end,
1245 uint64_t dst, uint64_t flags)
1247 struct amdgpu_device *adev = params->adev;
1248 const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1;
1250 uint64_t addr, pe_start;
1251 struct amdgpu_bo *pt;
1252 unsigned nptes;
1254 /* walk over the address space and update the page tables */
1255 for (addr = start; addr < end; addr += nptes,
1256 dst += nptes * AMDGPU_GPU_PAGE_SIZE) {
1257 struct amdgpu_vm_pt *entry, *parent;
1259 amdgpu_vm_get_entry(params, addr, &entry, &parent);
1260 if (!entry)
1261 return -ENOENT;
1263 if ((addr & ~mask) == (end & ~mask))
1264 nptes = end - addr;
1265 else
1266 nptes = AMDGPU_VM_PTE_COUNT(adev) - (addr & mask);
1268 amdgpu_vm_handle_huge_pages(params, entry, parent,
1269 nptes, dst, flags);
1270 /* We don't need to update PTEs for huge pages */
1271 if (entry->huge)
1272 continue;
1274 pt = entry->base.bo;
1275 pe_start = (addr & mask) * 8;
1276 if (pt->shadow)
1277 params->func(params, pt->shadow, pe_start, dst, nptes,
1278 AMDGPU_GPU_PAGE_SIZE, flags);
1279 params->func(params, pt, pe_start, dst, nptes,
1280 AMDGPU_GPU_PAGE_SIZE, flags);
1283 return 0;
1287 * amdgpu_vm_frag_ptes - add fragment information to PTEs
1289 * @params: see amdgpu_pte_update_params definition
1290 * @vm: requested vm
1291 * @start: first PTE to handle
1292 * @end: last PTE to handle
1293 * @dst: addr those PTEs should point to
1294 * @flags: hw mapping flags
1296 * Returns:
1297 * 0 for success, -EINVAL for failure.
1299 static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
1300 uint64_t start, uint64_t end,
1301 uint64_t dst, uint64_t flags)
1304 * The MC L1 TLB supports variable sized pages, based on a fragment
1305 * field in the PTE. When this field is set to a non-zero value, page
1306 * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
1307 * flags are considered valid for all PTEs within the fragment range
1308 * and corresponding mappings are assumed to be physically contiguous.
1310 * The L1 TLB can store a single PTE for the whole fragment,
1311 * significantly increasing the space available for translation
1312 * caching. This leads to large improvements in throughput when the
1313 * TLB is under pressure.
1315 * The L2 TLB distributes small and large fragments into two
1316 * asymmetric partitions. The large fragment cache is significantly
1317 * larger. Thus, we try to use large fragments wherever possible.
1318 * Userspace can support this by aligning virtual base address and
1319 * allocation size to the fragment size.
1321 unsigned max_frag = params->adev->vm_manager.fragment_size;
1322 int r;
1324 /* system pages are not physically contiguous */
1325 if (params->src || !(flags & AMDGPU_PTE_VALID))
1326 return amdgpu_vm_update_ptes(params, start, end, dst, flags);
1328 while (start != end) {
1329 uint64_t frag_flags, frag_end;
1330 unsigned frag;
1332 /* This intentionally wraps around if no bit is set */
1333 frag = min((unsigned)ffs(start) - 1,
1334 (unsigned)fls64(end - start) - 1);
1335 if (frag >= max_frag) {
1336 frag_flags = AMDGPU_PTE_FRAG(max_frag);
1337 frag_end = end & ~((1ULL << max_frag) - 1);
1338 } else {
1339 frag_flags = AMDGPU_PTE_FRAG(frag);
1340 frag_end = start + (1 << frag);
1343 r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
1344 flags | frag_flags);
1345 if (r)
1346 return r;
1348 dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
1349 start = frag_end;
1352 return 0;
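/*
 * Worked example (assuming the default fragment_size of 9): for a
 * mapping that starts at page 0x200 and is 0x300 pages long, both
 * ffs(start) - 1 and fls64(end - start) - 1 evaluate to 9, so the first
 * 0x200 pages are written with AMDGPU_PTE_FRAG(9) (2MiB fragments) and
 * the remaining 0x100 pages get AMDGPU_PTE_FRAG(8) on the next loop
 * iteration.
 */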
1356 * amdgpu_vm_bo_update_mapping - update a mapping in the vm page table
1358 * @adev: amdgpu_device pointer
1359 * @exclusive: fence we need to sync to
1360 * @pages_addr: DMA addresses to use for mapping
1361 * @vm: requested vm
1362 * @start: start of mapped range
1363 * @last: last mapped entry
1364 * @flags: flags for the entries
1365 * @addr: addr to set the area to
1366 * @fence: optional resulting fence
1368 * Fill in the page table entries between @start and @last.
1370 * Returns:
1371 * 0 for success, -EINVAL for failure.
1373 static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
1374 struct dma_fence *exclusive,
1375 dma_addr_t *pages_addr,
1376 struct amdgpu_vm *vm,
1377 uint64_t start, uint64_t last,
1378 uint64_t flags, uint64_t addr,
1379 struct dma_fence **fence)
1381 struct amdgpu_ring *ring;
1382 void *owner = AMDGPU_FENCE_OWNER_VM;
1383 unsigned nptes, ncmds, ndw;
1384 struct amdgpu_job *job;
1385 struct amdgpu_pte_update_params params;
1386 struct dma_fence *f = NULL;
1387 int r;
1389 memset(&params, 0, sizeof(params));
1390 params.adev = adev;
1391 params.vm = vm;
1393 /* sync to everything on unmapping */
1394 if (!(flags & AMDGPU_PTE_VALID))
1395 owner = AMDGPU_FENCE_OWNER_UNDEFINED;
1397 if (vm->use_cpu_for_update) {
1398 /* params.src is used as a flag to indicate system memory */
1399 if (pages_addr)
1400 params.src = ~0;
1402 /* Wait for PT BOs to be free. PTs share the same resv. object
1403 * as the root PD BO
1405 r = amdgpu_vm_wait_pd(adev, vm, owner);
1406 if (unlikely(r))
1407 return r;
1409 params.func = amdgpu_vm_cpu_set_ptes;
1410 params.pages_addr = pages_addr;
1411 return amdgpu_vm_frag_ptes(&params, start, last + 1,
1412 addr, flags);
1415 ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, sched);
1417 nptes = last - start + 1;
1420 * reserve space for two commands every (1 << BLOCK_SIZE)
1421 * entries or 2k dwords (whatever is smaller)
1423 * The second command is for the shadow pagetables.
1425 if (vm->root.base.bo->shadow)
1426 ncmds = ((nptes >> min(adev->vm_manager.block_size, 11u)) + 1) * 2;
1427 else
1428 ncmds = ((nptes >> min(adev->vm_manager.block_size, 11u)) + 1);
1430 /* padding, etc. */
1431 ndw = 64;
1433 if (pages_addr) {
1434 /* copy commands needed */
1435 ndw += ncmds * adev->vm_manager.vm_pte_funcs->copy_pte_num_dw;
1437 /* and also PTEs */
1438 ndw += nptes * 2;
1440 params.func = amdgpu_vm_do_copy_ptes;
1442 } else {
1443 /* set page commands needed */
1444 ndw += ncmds * 10;
1446 /* extra commands for begin/end fragments */
1447 if (vm->root.base.bo->shadow)
1448 ndw += 2 * 10 * adev->vm_manager.fragment_size * 2;
1449 else
1450 ndw += 2 * 10 * adev->vm_manager.fragment_size;
1452 params.func = amdgpu_vm_do_set_ptes;
1455 r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
1456 if (r)
1457 return r;
1459 params.ib = &job->ibs[0];
1461 if (pages_addr) {
1462 uint64_t *pte;
1463 unsigned i;
1465 /* Put the PTEs at the end of the IB. */
1466 i = ndw - nptes * 2;
1467 pte = (uint64_t *)&(job->ibs->ptr[i]);
1468 params.src = job->ibs->gpu_addr + i * 4;
1470 for (i = 0; i < nptes; ++i) {
1471 pte[i] = amdgpu_vm_map_gart(pages_addr, addr + i *
1472 AMDGPU_GPU_PAGE_SIZE);
1473 pte[i] |= flags;
1475 addr = 0;
1478 r = amdgpu_sync_fence(adev, &job->sync, exclusive, false);
1479 if (r)
1480 goto error_free;
1482 r = amdgpu_sync_resv(adev, &job->sync, vm->root.base.bo->tbo.resv,
1483 owner, false);
1484 if (r)
1485 goto error_free;
1487 r = reservation_object_reserve_shared(vm->root.base.bo->tbo.resv);
1488 if (r)
1489 goto error_free;
1491 r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
1492 if (r)
1493 goto error_free;
1495 amdgpu_ring_pad_ib(ring, params.ib);
1496 WARN_ON(params.ib->length_dw > ndw);
1497 r = amdgpu_job_submit(job, &vm->entity, AMDGPU_FENCE_OWNER_VM, &f);
1498 if (r)
1499 goto error_free;
1501 amdgpu_bo_fence(vm->root.base.bo, f, true);
1502 dma_fence_put(*fence);
1503 *fence = f;
1504 return 0;
1506 error_free:
1507 amdgpu_job_free(job);
1508 return r;
1512 * amdgpu_vm_bo_split_mapping - split a mapping into smaller chunks
1514 * @adev: amdgpu_device pointer
1515 * @exclusive: fence we need to sync to
1516 * @pages_addr: DMA addresses to use for mapping
1517 * @vm: requested vm
1518 * @mapping: mapped range and flags to use for the update
1519 * @flags: HW flags for the mapping
1520 * @nodes: array of drm_mm_nodes with the MC addresses
1521 * @fence: optional resulting fence
1523 * Split the mapping into smaller chunks so that each update fits
1524 * into a SDMA IB.
1526 * Returns:
1527 * 0 for success, -EINVAL for failure.
1529 static int amdgpu_vm_bo_split_mapping(struct amdgpu_device *adev,
1530 struct dma_fence *exclusive,
1531 dma_addr_t *pages_addr,
1532 struct amdgpu_vm *vm,
1533 struct amdgpu_bo_va_mapping *mapping,
1534 uint64_t flags,
1535 struct drm_mm_node *nodes,
1536 struct dma_fence **fence)
1538 unsigned min_linear_pages = 1 << adev->vm_manager.fragment_size;
1539 uint64_t pfn, start = mapping->start;
1540 int r;
1542 /* normally, bo_va->flags only contains the READABLE and WRITEABLE bits,
1543  * but we filter the flags here anyway just in case
1545 if (!(mapping->flags & AMDGPU_PTE_READABLE))
1546 flags &= ~AMDGPU_PTE_READABLE;
1547 if (!(mapping->flags & AMDGPU_PTE_WRITEABLE))
1548 flags &= ~AMDGPU_PTE_WRITEABLE;
1550 flags &= ~AMDGPU_PTE_EXECUTABLE;
1551 flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE;
1553 flags &= ~AMDGPU_PTE_MTYPE_MASK;
1554 flags |= (mapping->flags & AMDGPU_PTE_MTYPE_MASK);
1556 if ((mapping->flags & AMDGPU_PTE_PRT) &&
1557 (adev->asic_type >= CHIP_VEGA10)) {
1558 flags |= AMDGPU_PTE_PRT;
1559 flags &= ~AMDGPU_PTE_VALID;
1562 trace_amdgpu_vm_bo_update(mapping);
1564 pfn = mapping->offset >> PAGE_SHIFT;
1565 if (nodes) {
1566 while (pfn >= nodes->size) {
1567 pfn -= nodes->size;
1568 ++nodes;
1572 do {
1573 dma_addr_t *dma_addr = NULL;
1574 uint64_t max_entries;
1575 uint64_t addr, last;
1577 if (nodes) {
1578 addr = nodes->start << PAGE_SHIFT;
1579 max_entries = (nodes->size - pfn) *
1580 AMDGPU_GPU_PAGES_IN_CPU_PAGE;
1581 } else {
1582 addr = 0;
1583 max_entries = S64_MAX;
1586 if (pages_addr) {
1587 uint64_t count;
1589 max_entries = min(max_entries, 16ull * 1024ull);
1590 for (count = 1;
1591 count < max_entries / AMDGPU_GPU_PAGES_IN_CPU_PAGE;
1592 ++count) {
1593 uint64_t idx = pfn + count;
1595 if (pages_addr[idx] !=
1596 (pages_addr[idx - 1] + PAGE_SIZE))
1597 break;
1600 if (count < min_linear_pages) {
1601 addr = pfn << PAGE_SHIFT;
1602 dma_addr = pages_addr;
1603 } else {
1604 addr = pages_addr[pfn];
1605 max_entries = count * AMDGPU_GPU_PAGES_IN_CPU_PAGE;
1608 } else if (flags & AMDGPU_PTE_VALID) {
1609 addr += adev->vm_manager.vram_base_offset;
1610 addr += pfn << PAGE_SHIFT;
1613 last = min((uint64_t)mapping->last, start + max_entries - 1);
1614 r = amdgpu_vm_bo_update_mapping(adev, exclusive, dma_addr, vm,
1615 start, last, flags, addr,
1616 fence);
1617 if (r)
1618 return r;
1620 pfn += (last - start + 1) / AMDGPU_GPU_PAGES_IN_CPU_PAGE;
1621 if (nodes && nodes->size == pfn) {
1622 pfn = 0;
1623 ++nodes;
1625 start = last + 1;
1627 } while (unlikely(start != mapping->last + 1));
1629 return 0;
1633 * amdgpu_vm_bo_update - update all BO mappings in the vm page table
1635 * @adev: amdgpu_device pointer
1636 * @bo_va: requested BO and VM object
1637 * @clear: if true clear the entries
1639 * Fill in the page table entries for @bo_va.
1641 * Returns:
1642 * 0 for success, -EINVAL for failure.
1644 int amdgpu_vm_bo_update(struct amdgpu_device *adev,
1645 struct amdgpu_bo_va *bo_va,
1646 bool clear)
1648 struct amdgpu_bo *bo = bo_va->base.bo;
1649 struct amdgpu_vm *vm = bo_va->base.vm;
1650 struct amdgpu_bo_va_mapping *mapping;
1651 dma_addr_t *pages_addr = NULL;
1652 struct ttm_mem_reg *mem;
1653 struct drm_mm_node *nodes;
1654 struct dma_fence *exclusive, **last_update;
1655 uint64_t flags;
1656 int r;
1658 if (clear || !bo) {
1659 mem = NULL;
1660 nodes = NULL;
1661 exclusive = NULL;
1662 } else {
1663 struct ttm_dma_tt *ttm;
1665 mem = &bo->tbo.mem;
1666 nodes = mem->mm_node;
1667 if (mem->mem_type == TTM_PL_TT) {
1668 ttm = container_of(bo->tbo.ttm, struct ttm_dma_tt, ttm);
1669 pages_addr = ttm->dma_address;
1671 exclusive = reservation_object_get_excl(bo->tbo.resv);
1674 if (bo)
1675 flags = amdgpu_ttm_tt_pte_flags(adev, bo->tbo.ttm, mem);
1676 else
1677 flags = 0x0;
1679 if (clear || (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv))
1680 last_update = &vm->last_update;
1681 else
1682 last_update = &bo_va->last_pt_update;
1684 if (!clear && bo_va->base.moved) {
1685 bo_va->base.moved = false;
1686 list_splice_init(&bo_va->valids, &bo_va->invalids);
1688 } else if (bo_va->cleared != clear) {
1689 list_splice_init(&bo_va->valids, &bo_va->invalids);
1692 list_for_each_entry(mapping, &bo_va->invalids, list) {
1693 r = amdgpu_vm_bo_split_mapping(adev, exclusive, pages_addr, vm,
1694 mapping, flags, nodes,
1695 last_update);
1696 if (r)
1697 return r;
1700 if (vm->use_cpu_for_update) {
1701 /* Flush HDP */
1702 mb();
1703 amdgpu_asic_flush_hdp(adev, NULL);
1706 spin_lock(&vm->moved_lock);
1707 list_del_init(&bo_va->base.vm_status);
1708 spin_unlock(&vm->moved_lock);
1710 /* If the BO is not in its preferred location add it back to
1711 * the evicted list so that it gets validated again on the
1712 * next command submission.
1714 if (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv) {
1715 uint32_t mem_type = bo->tbo.mem.mem_type;
1717 if (!(bo->preferred_domains & amdgpu_mem_type_to_domain(mem_type)))
1718 list_add_tail(&bo_va->base.vm_status, &vm->evicted);
1719 else
1720 list_add(&bo_va->base.vm_status, &vm->idle);
1723 list_splice_init(&bo_va->invalids, &bo_va->valids);
1724 bo_va->cleared = clear;
1726 if (trace_amdgpu_vm_bo_mapping_enabled()) {
1727 list_for_each_entry(mapping, &bo_va->valids, list)
1728 trace_amdgpu_vm_bo_mapping(mapping);
1731 return 0;
1735 * amdgpu_vm_update_prt_state - update the global PRT state
1737 * @adev: amdgpu_device pointer
1739 static void amdgpu_vm_update_prt_state(struct amdgpu_device *adev)
1741 unsigned long flags;
1742 bool enable;
1744 spin_lock_irqsave(&adev->vm_manager.prt_lock, flags);
1745 enable = !!atomic_read(&adev->vm_manager.num_prt_users);
1746 adev->gmc.gmc_funcs->set_prt(adev, enable);
1747 spin_unlock_irqrestore(&adev->vm_manager.prt_lock, flags);
1751 * amdgpu_vm_prt_get - add a PRT user
1753 * @adev: amdgpu_device pointer
1755 static void amdgpu_vm_prt_get(struct amdgpu_device *adev)
1757 if (!adev->gmc.gmc_funcs->set_prt)
1758 return;
1760 if (atomic_inc_return(&adev->vm_manager.num_prt_users) == 1)
1761 amdgpu_vm_update_prt_state(adev);
1765 * amdgpu_vm_prt_put - drop a PRT user
1767 * @adev: amdgpu_device pointer
1769 static void amdgpu_vm_prt_put(struct amdgpu_device *adev)
1771 if (atomic_dec_return(&adev->vm_manager.num_prt_users) == 0)
1772 amdgpu_vm_update_prt_state(adev);
1776 * amdgpu_vm_prt_cb - callback for updating the PRT status
1778 * @fence: fence for the callback
1779 * @_cb: the callback function
1781 static void amdgpu_vm_prt_cb(struct dma_fence *fence, struct dma_fence_cb *_cb)
1783 struct amdgpu_prt_cb *cb = container_of(_cb, struct amdgpu_prt_cb, cb);
1785 amdgpu_vm_prt_put(cb->adev);
1786 kfree(cb);
1790 * amdgpu_vm_add_prt_cb - add callback for updating the PRT status
1792 * @adev: amdgpu_device pointer
1793 * @fence: fence for the callback
1795 static void amdgpu_vm_add_prt_cb(struct amdgpu_device *adev,
1796 struct dma_fence *fence)
1798 struct amdgpu_prt_cb *cb;
1800 if (!adev->gmc.gmc_funcs->set_prt)
1801 return;
1803 cb = kmalloc(sizeof(struct amdgpu_prt_cb), GFP_KERNEL);
1804 if (!cb) {
1805 /* Last resort when we are OOM */
1806 if (fence)
1807 dma_fence_wait(fence, false);
1809 amdgpu_vm_prt_put(adev);
1810 } else {
1811 cb->adev = adev;
1812 if (!fence || dma_fence_add_callback(fence, &cb->cb,
1813 amdgpu_vm_prt_cb))
1814 amdgpu_vm_prt_cb(fence, &cb->cb);
1819 * amdgpu_vm_free_mapping - free a mapping
1821 * @adev: amdgpu_device pointer
1822 * @vm: requested vm
1823 * @mapping: mapping to be freed
1824 * @fence: fence of the unmap operation
1826 * Free a mapping and make sure we decrease the PRT usage count if applicable.
1828 static void amdgpu_vm_free_mapping(struct amdgpu_device *adev,
1829 struct amdgpu_vm *vm,
1830 struct amdgpu_bo_va_mapping *mapping,
1831 struct dma_fence *fence)
1833 if (mapping->flags & AMDGPU_PTE_PRT)
1834 amdgpu_vm_add_prt_cb(adev, fence);
1835 kfree(mapping);
1839 * amdgpu_vm_prt_fini - finish all prt mappings
1841 * @adev: amdgpu_device pointer
1842 * @vm: requested vm
1844 * Register a cleanup callback to disable PRT support after VM dies.
1846 static void amdgpu_vm_prt_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
1848 struct reservation_object *resv = vm->root.base.bo->tbo.resv;
1849 struct dma_fence *excl, **shared;
1850 unsigned i, shared_count;
1851 int r;
1853 r = reservation_object_get_fences_rcu(resv, &excl,
1854 &shared_count, &shared);
1855 if (r) {
1856 /* Not enough memory to grab the fence list; as a last resort,
1857  * block for all the fences to complete.
1859 reservation_object_wait_timeout_rcu(resv, true, false,
1860 MAX_SCHEDULE_TIMEOUT);
1861 return;
1864 /* Add a callback for each fence in the reservation object */
1865 amdgpu_vm_prt_get(adev);
1866 amdgpu_vm_add_prt_cb(adev, excl);
1868 for (i = 0; i < shared_count; ++i) {
1869 amdgpu_vm_prt_get(adev);
1870 amdgpu_vm_add_prt_cb(adev, shared[i]);
1873 kfree(shared);
1877 * amdgpu_vm_clear_freed - clear freed BOs in the PT
1879 * @adev: amdgpu_device pointer
1880 * @vm: requested vm
1881 * @fence: optional resulting fence (unchanged if no work needed to be done
1882 * or if an error occurred)
1884 * Make sure all freed BOs are cleared in the PT.
1885 * PTs have to be reserved and mutex must be locked!
1887 * Returns:
1888 * 0 for success.
1891 int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
1892 struct amdgpu_vm *vm,
1893 struct dma_fence **fence)
1895 struct amdgpu_bo_va_mapping *mapping;
1896 uint64_t init_pte_value = 0;
1897 struct dma_fence *f = NULL;
1898 int r;
1900 while (!list_empty(&vm->freed)) {
1901 mapping = list_first_entry(&vm->freed,
1902 struct amdgpu_bo_va_mapping, list);
1903 list_del(&mapping->list);
1905 if (vm->pte_support_ats && mapping->start < AMDGPU_VA_HOLE_START)
1906 init_pte_value = AMDGPU_PTE_DEFAULT_ATC;
1908 r = amdgpu_vm_bo_update_mapping(adev, NULL, NULL, vm,
1909 mapping->start, mapping->last,
1910 init_pte_value, 0, &f);
1911 amdgpu_vm_free_mapping(adev, vm, mapping, f);
1912 if (r) {
1913 dma_fence_put(f);
1914 return r;
1918 if (fence && f) {
1919 dma_fence_put(*fence);
1920 *fence = f;
1921 } else {
1922 dma_fence_put(f);
1925 return 0;
1930 * amdgpu_vm_handle_moved - handle moved BOs in the PT
1932 * @adev: amdgpu_device pointer
1933 * @vm: requested vm
1935 * Make sure all BOs which are moved are updated in the PTs.
1937 * Returns:
1938 * 0 for success.
1940 * PTs have to be reserved!
1942 int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
1943 struct amdgpu_vm *vm)
1945 struct amdgpu_bo_va *bo_va, *tmp;
1946 struct list_head moved;
1947 bool clear;
1948 int r;
1950 INIT_LIST_HEAD(&moved);
1951 spin_lock(&vm->moved_lock);
1952 list_splice_init(&vm->moved, &moved);
1953 spin_unlock(&vm->moved_lock);
1955 list_for_each_entry_safe(bo_va, tmp, &moved, base.vm_status) {
1956 struct reservation_object *resv = bo_va->base.bo->tbo.resv;
1958 /* Per VM BOs never need to be cleared in the page tables */
1959 if (resv == vm->root.base.bo->tbo.resv)
1960 clear = false;
1961 /* Try to reserve the BO to avoid clearing its ptes */
1962 else if (!amdgpu_vm_debug && reservation_object_trylock(resv))
1963 clear = false;
1964 /* Somebody else is using the BO right now */
1965 else
1966 clear = true;
1968 r = amdgpu_vm_bo_update(adev, bo_va, clear);
1969 if (r) {
1970 spin_lock(&vm->moved_lock);
1971 list_splice(&moved, &vm->moved);
1972 spin_unlock(&vm->moved_lock);
1973 return r;
1976 if (!clear && resv != vm->root.base.bo->tbo.resv)
1977 reservation_object_unlock(resv);
1981 return 0;
1985 * amdgpu_vm_bo_add - add a bo to a specific vm
1987 * @adev: amdgpu_device pointer
1988 * @vm: requested vm
1989 * @bo: amdgpu buffer object
1991 * Add @bo into the requested vm.
1992 * Add @bo to the list of bos associated with the vm
1994 * Returns:
1995 * Newly added bo_va or NULL for failure
1997 * Object has to be reserved!
1999 struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
2000 struct amdgpu_vm *vm,
2001 struct amdgpu_bo *bo)
2003 struct amdgpu_bo_va *bo_va;
2005 bo_va = kzalloc(sizeof(struct amdgpu_bo_va), GFP_KERNEL);
2006 if (bo_va == NULL) {
2007 return NULL;
2009 amdgpu_vm_bo_base_init(&bo_va->base, vm, bo);
2011 bo_va->ref_count = 1;
2012 INIT_LIST_HEAD(&bo_va->valids);
2013 INIT_LIST_HEAD(&bo_va->invalids);
2015 return bo_va;
2020 * amdgpu_vm_bo_insert_map - insert a new mapping
2022 * @adev: amdgpu_device pointer
2023 * @bo_va: bo_va to store the address
2024 * @mapping: the mapping to insert
2026 * Insert a new mapping into all structures.
2028 static void amdgpu_vm_bo_insert_map(struct amdgpu_device *adev,
2029 struct amdgpu_bo_va *bo_va,
2030 struct amdgpu_bo_va_mapping *mapping)
2032 struct amdgpu_vm *vm = bo_va->base.vm;
2033 struct amdgpu_bo *bo = bo_va->base.bo;
2035 mapping->bo_va = bo_va;
2036 list_add(&mapping->list, &bo_va->invalids);
2037 amdgpu_vm_it_insert(mapping, &vm->va);
2039 if (mapping->flags & AMDGPU_PTE_PRT)
2040 amdgpu_vm_prt_get(adev);
2042 if (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv &&
2043 !bo_va->base.moved) {
2044 spin_lock(&vm->moved_lock);
2045 list_move(&bo_va->base.vm_status, &vm->moved);
2046 spin_unlock(&vm->moved_lock);
2048 trace_amdgpu_vm_bo_map(bo_va, mapping);
2052 * amdgpu_vm_bo_map - map bo inside a vm
2054 * @adev: amdgpu_device pointer
2055 * @bo_va: bo_va to store the address
2056 * @saddr: where to map the BO
2057 * @offset: requested offset in the BO
2058 * @size: BO size in bytes
2059 * @flags: attributes of pages (read/write/valid/etc.)
2061 * Add a mapping of the BO at the specified addr into the VM.
2063 * Returns:
2064 * 0 for success, error for failure.
2066 * Object has to be reserved and unreserved outside!
2068 int amdgpu_vm_bo_map(struct amdgpu_device *adev,
2069 struct amdgpu_bo_va *bo_va,
2070 uint64_t saddr, uint64_t offset,
2071 uint64_t size, uint64_t flags)
2073 struct amdgpu_bo_va_mapping *mapping, *tmp;
2074 struct amdgpu_bo *bo = bo_va->base.bo;
2075 struct amdgpu_vm *vm = bo_va->base.vm;
2076 uint64_t eaddr;
2078 /* validate the parameters */
2079 if (saddr & AMDGPU_GPU_PAGE_MASK || offset & AMDGPU_GPU_PAGE_MASK ||
2080 size == 0 || size & AMDGPU_GPU_PAGE_MASK)
2081 return -EINVAL;
2083 /* make sure object fit at this offset */
2084 eaddr = saddr + size - 1;
2085 if (saddr >= eaddr ||
2086 (bo && offset + size > amdgpu_bo_size(bo)))
2087 return -EINVAL;
2089 saddr /= AMDGPU_GPU_PAGE_SIZE;
2090 eaddr /= AMDGPU_GPU_PAGE_SIZE;
2092 tmp = amdgpu_vm_it_iter_first(&vm->va, saddr, eaddr);
2093 if (tmp) {
2094 /* bo and tmp overlap, invalid addr */
2095 dev_err(adev->dev, "bo %p va 0x%010Lx-0x%010Lx conflict with "
2096 "0x%010Lx-0x%010Lx\n", bo, saddr, eaddr,
2097 tmp->start, tmp->last + 1);
2098 return -EINVAL;
2101 mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
2102 if (!mapping)
2103 return -ENOMEM;
2105 mapping->start = saddr;
2106 mapping->last = eaddr;
2107 mapping->offset = offset;
2108 mapping->flags = flags;
2110 amdgpu_vm_bo_insert_map(adev, bo_va, mapping);
2112 return 0;
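/*
 * Usage sketch (illustrative, error handling omitted; the BO must be
 * reserved and both va and the BO size GPU page aligned):
 *
 *	bo_va = amdgpu_vm_bo_add(adev, vm, bo);
 *	r = amdgpu_vm_bo_map(adev, bo_va, va, 0, amdgpu_bo_size(bo),
 *			     AMDGPU_PTE_READABLE | AMDGPU_PTE_WRITEABLE);
 */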
2116 * amdgpu_vm_bo_replace_map - map bo inside a vm, replacing existing mappings
2118 * @adev: amdgpu_device pointer
2119 * @bo_va: bo_va to store the address
2120 * @saddr: where to map the BO
2121 * @offset: requested offset in the BO
2122 * @size: BO size in bytes
2123 * @flags: attributes of pages (read/write/valid/etc.)
2125 * Add a mapping of the BO at the specified addr into the VM. Replace existing
2126 * mappings as we do so.
2128 * Returns:
2129 * 0 for success, error for failure.
2131 * Object has to be reserved and unreserved outside!
2133 int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,
2134 struct amdgpu_bo_va *bo_va,
2135 uint64_t saddr, uint64_t offset,
2136 uint64_t size, uint64_t flags)
2138 struct amdgpu_bo_va_mapping *mapping;
2139 struct amdgpu_bo *bo = bo_va->base.bo;
2140 uint64_t eaddr;
2141 int r;
2143 /* validate the parameters */
2144 if (saddr & AMDGPU_GPU_PAGE_MASK || offset & AMDGPU_GPU_PAGE_MASK ||
2145 size == 0 || size & AMDGPU_GPU_PAGE_MASK)
2146 return -EINVAL;
2148 /* make sure object fits at this offset */
2149 eaddr = saddr + size - 1;
2150 if (saddr >= eaddr ||
2151 (bo && offset + size > amdgpu_bo_size(bo)))
2152 return -EINVAL;
2154 /* Allocate all the needed memory */
2155 mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
2156 if (!mapping)
2157 return -ENOMEM;
2159 r = amdgpu_vm_bo_clear_mappings(adev, bo_va->base.vm, saddr, size);
2160 if (r) {
2161 kfree(mapping);
2162 return r;
2165 saddr /= AMDGPU_GPU_PAGE_SIZE;
2166 eaddr /= AMDGPU_GPU_PAGE_SIZE;
2168 mapping->start = saddr;
2169 mapping->last = eaddr;
2170 mapping->offset = offset;
2171 mapping->flags = flags;
2173 amdgpu_vm_bo_insert_map(adev, bo_va, mapping);
2175 return 0;
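/*
 * Illustrative sketch only: unlike amdgpu_vm_bo_map(), the replace variant
 * first clears anything overlapping [saddr, saddr + size), so it does not
 * fail with -EINVAL because of a conflicting mapping.  A hypothetical caller
 * re-binding a BO over an existing range could do:
 *
 *	r = amdgpu_vm_bo_replace_map(adev, bo_va, va_addr, 0,
 *				     amdgpu_bo_size(bo), flags);
 *
 * "va_addr" and "flags" are assumed to come from the caller.
 */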
2179 * amdgpu_vm_bo_unmap - remove bo mapping from vm
2181 * @adev: amdgpu_device pointer
2182 * @bo_va: bo_va to remove the address from
2183 * @saddr: where the BO is mapped
2185 * Remove a mapping of the BO at the specified addr from the VM.
2187 * Returns:
2188 * 0 for success, error for failure.
2190 * Object has to be reserved and unreserved outside!
2192 int amdgpu_vm_bo_unmap(struct amdgpu_device *adev,
2193 struct amdgpu_bo_va *bo_va,
2194 uint64_t saddr)
2196 struct amdgpu_bo_va_mapping *mapping;
2197 struct amdgpu_vm *vm = bo_va->base.vm;
2198 bool valid = true;
2200 saddr /= AMDGPU_GPU_PAGE_SIZE;
2202 list_for_each_entry(mapping, &bo_va->valids, list) {
2203 if (mapping->start == saddr)
2204 break;
2207 if (&mapping->list == &bo_va->valids) {
2208 valid = false;
2210 list_for_each_entry(mapping, &bo_va->invalids, list) {
2211 if (mapping->start == saddr)
2212 break;
2215 if (&mapping->list == &bo_va->invalids)
2216 return -ENOENT;
2219 list_del(&mapping->list);
2220 amdgpu_vm_it_remove(mapping, &vm->va);
2221 mapping->bo_va = NULL;
2222 trace_amdgpu_vm_bo_unmap(bo_va, mapping);
2224 if (valid)
2225 list_add(&mapping->list, &vm->freed);
2226 else
2227 amdgpu_vm_free_mapping(adev, vm, mapping,
2228 bo_va->last_pt_update);
2230 return 0;
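/*
 * Illustrative sketch only: tearing a mapping down again uses the same
 * byte-granular start address that was passed to amdgpu_vm_bo_map(); the
 * BO must be reserved by the caller, and "va_addr" is an assumed name.
 *
 *	r = amdgpu_vm_bo_unmap(adev, bo_va, va_addr);
 *	if (r == -ENOENT)
 *		;	/* nothing was mapped at va_addr for this bo_va */
 */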
2234 * amdgpu_vm_bo_clear_mappings - remove all mappings in a specific range
2236 * @adev: amdgpu_device pointer
2237 * @vm: VM structure to use
2238 * @saddr: start of the range
2239 * @size: size of the range
2241 * Remove all mappings in a range, splitting them as appropriate.
2243 * Returns:
2244 * 0 for success, error for failure.
2246 int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev,
2247 struct amdgpu_vm *vm,
2248 uint64_t saddr, uint64_t size)
2250 struct amdgpu_bo_va_mapping *before, *after, *tmp, *next;
2251 LIST_HEAD(removed);
2252 uint64_t eaddr;
2254 eaddr = saddr + size - 1;
2255 saddr /= AMDGPU_GPU_PAGE_SIZE;
2256 eaddr /= AMDGPU_GPU_PAGE_SIZE;
2258 /* Allocate all the needed memory */
2259 before = kzalloc(sizeof(*before), GFP_KERNEL);
2260 if (!before)
2261 return -ENOMEM;
2262 INIT_LIST_HEAD(&before->list);
2264 after = kzalloc(sizeof(*after), GFP_KERNEL);
2265 if (!after) {
2266 kfree(before);
2267 return -ENOMEM;
2269 INIT_LIST_HEAD(&after->list);
2271 /* Now gather all removed mappings */
2272 tmp = amdgpu_vm_it_iter_first(&vm->va, saddr, eaddr);
2273 while (tmp) {
2274 /* Remember mapping split at the start */
2275 if (tmp->start < saddr) {
2276 before->start = tmp->start;
2277 before->last = saddr - 1;
2278 before->offset = tmp->offset;
2279 before->flags = tmp->flags;
2280 before->bo_va = tmp->bo_va;
2281 list_add(&before->list, &tmp->bo_va->invalids);
2284 /* Remember mapping split at the end */
2285 if (tmp->last > eaddr) {
2286 after->start = eaddr + 1;
2287 after->last = tmp->last;
2288 after->offset = tmp->offset;
2289 after->offset += (after->start - tmp->start) << PAGE_SHIFT;
2290 after->flags = tmp->flags;
2291 after->bo_va = tmp->bo_va;
2292 list_add(&after->list, &tmp->bo_va->invalids);
2295 list_del(&tmp->list);
2296 list_add(&tmp->list, &removed);
2298 tmp = amdgpu_vm_it_iter_next(tmp, saddr, eaddr);
2301 /* And free them up */
2302 list_for_each_entry_safe(tmp, next, &removed, list) {
2303 amdgpu_vm_it_remove(tmp, &vm->va);
2304 list_del(&tmp->list);
2306 if (tmp->start < saddr)
2307 tmp->start = saddr;
2308 if (tmp->last > eaddr)
2309 tmp->last = eaddr;
2311 tmp->bo_va = NULL;
2312 list_add(&tmp->list, &vm->freed);
2313 trace_amdgpu_vm_bo_unmap(NULL, tmp);
2316 /* Insert partial mapping before the range */
2317 if (!list_empty(&before->list)) {
2318 amdgpu_vm_it_insert(before, &vm->va);
2319 if (before->flags & AMDGPU_PTE_PRT)
2320 amdgpu_vm_prt_get(adev);
2321 } else {
2322 kfree(before);
2325 /* Insert partial mapping after the range */
2326 if (!list_empty(&after->list)) {
2327 amdgpu_vm_it_insert(after, &vm->va);
2328 if (after->flags & AMDGPU_PTE_PRT)
2329 amdgpu_vm_prt_get(adev);
2330 } else {
2331 kfree(after);
2334 return 0;
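/*
 * Worked example (hypothetical numbers): assume a single mapping covering
 * GPU pages [0x100, 0x1ff] and a call that clears the byte range
 * corresponding to pages [0x140, 0x17f].  The code above then
 *
 *   - queues the overlapping middle part [0x140, 0x17f] on vm->freed,
 *   - keeps "before" as [0x100, 0x13f] with the original BO offset, and
 *   - keeps "after" as [0x180, 0x1ff] with its BO offset advanced by the
 *     0x80 pages (0x80000 bytes) that now precede it,
 *
 * so the untouched head and tail of the original mapping stay mapped.
 */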
2338 * amdgpu_vm_bo_lookup_mapping - find mapping by address
2340 * @vm: the requested VM
2341 * @addr: the address
2343 * Find a mapping by its address.
2345 * Returns:
2346 * The amdgpu_bo_va_mapping matching for addr or NULL
2349 struct amdgpu_bo_va_mapping *amdgpu_vm_bo_lookup_mapping(struct amdgpu_vm *vm,
2350 uint64_t addr)
2352 return amdgpu_vm_it_iter_first(&vm->va, addr, addr);
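/*
 * Note that @addr is expected in GPU pages, not bytes, since the interval
 * tree stores mapping->start/last in units of AMDGPU_GPU_PAGE_SIZE.  A
 * hypothetical caller holding a byte-granular virtual address would first
 * convert it, along the lines of:
 *
 *	mapping = amdgpu_vm_bo_lookup_mapping(vm, addr / AMDGPU_GPU_PAGE_SIZE);
 *	if (!mapping || !mapping->bo_va || !mapping->bo_va->base.bo)
 *		return -EINVAL;
 */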
2356 * amdgpu_vm_bo_trace_cs - trace all reserved mappings
2358 * @vm: the requested vm
2359 * @ticket: CS ticket
2361 * Trace all mappings of BOs reserved during a command submission.
2363 void amdgpu_vm_bo_trace_cs(struct amdgpu_vm *vm, struct ww_acquire_ctx *ticket)
2365 struct amdgpu_bo_va_mapping *mapping;
2367 if (!trace_amdgpu_vm_bo_cs_enabled())
2368 return;
2370 for (mapping = amdgpu_vm_it_iter_first(&vm->va, 0, U64_MAX); mapping;
2371 mapping = amdgpu_vm_it_iter_next(mapping, 0, U64_MAX)) {
2372 if (mapping->bo_va && mapping->bo_va->base.bo) {
2373 struct amdgpu_bo *bo;
2375 bo = mapping->bo_va->base.bo;
2376 if (READ_ONCE(bo->tbo.resv->lock.ctx) != ticket)
2377 continue;
2380 trace_amdgpu_vm_bo_cs(mapping);
2385 * amdgpu_vm_bo_rmv - remove a bo from a specific vm
2387 * @adev: amdgpu_device pointer
2388 * @bo_va: requested bo_va
2390 * Remove @bo_va->bo from the requested vm.
2392 * Object has to be reserved!
2394 void amdgpu_vm_bo_rmv(struct amdgpu_device *adev,
2395 struct amdgpu_bo_va *bo_va)
2397 struct amdgpu_bo_va_mapping *mapping, *next;
2398 struct amdgpu_vm *vm = bo_va->base.vm;
2400 list_del(&bo_va->base.bo_list);
2402 spin_lock(&vm->moved_lock);
2403 list_del(&bo_va->base.vm_status);
2404 spin_unlock(&vm->moved_lock);
2406 list_for_each_entry_safe(mapping, next, &bo_va->valids, list) {
2407 list_del(&mapping->list);
2408 amdgpu_vm_it_remove(mapping, &vm->va);
2409 mapping->bo_va = NULL;
2410 trace_amdgpu_vm_bo_unmap(bo_va, mapping);
2411 list_add(&mapping->list, &vm->freed);
2413 list_for_each_entry_safe(mapping, next, &bo_va->invalids, list) {
2414 list_del(&mapping->list);
2415 amdgpu_vm_it_remove(mapping, &vm->va);
2416 amdgpu_vm_free_mapping(adev, vm, mapping,
2417 bo_va->last_pt_update);
2420 dma_fence_put(bo_va->last_pt_update);
2421 kfree(bo_va);
2425 * amdgpu_vm_bo_invalidate - mark the bo as invalid
2427 * @adev: amdgpu_device pointer
2428 * @bo: amdgpu buffer object
2429 * @evicted: is the BO evicted
2431 * Mark @bo as invalid.
2433 void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
2434 struct amdgpu_bo *bo, bool evicted)
2436 struct amdgpu_vm_bo_base *bo_base;
2438 /* shadow bo doesn't have bo base, its validation needs its parent */
2439 if (bo->parent && bo->parent->shadow == bo)
2440 bo = bo->parent;
2442 list_for_each_entry(bo_base, &bo->va, bo_list) {
2443 struct amdgpu_vm *vm = bo_base->vm;
2444 bool was_moved = bo_base->moved;
2446 bo_base->moved = true;
2447 if (evicted && bo->tbo.resv == vm->root.base.bo->tbo.resv) {
2448 if (bo->tbo.type == ttm_bo_type_kernel)
2449 list_move(&bo_base->vm_status, &vm->evicted);
2450 else
2451 list_move_tail(&bo_base->vm_status,
2452 &vm->evicted);
2453 continue;
2456 if (was_moved)
2457 continue;
2459 if (bo->tbo.type == ttm_bo_type_kernel) {
2460 list_move(&bo_base->vm_status, &vm->relocated);
2461 } else {
2462 spin_lock(&bo_base->vm->moved_lock);
2463 list_move(&bo_base->vm_status, &vm->moved);
2464 spin_unlock(&bo_base->vm->moved_lock);
2470 * amdgpu_vm_get_block_size - calculate VM page table size as power of two
2472 * @vm_size: VM size
2474 * Returns:
2475 * VM page table size as a power of two
2477 static uint32_t amdgpu_vm_get_block_size(uint64_t vm_size)
2479 /* Total bits covered by PD + PTs */
2480 unsigned bits = ilog2(vm_size) + 18;
2482 /* Make sure the PD is 4K in size up to 8GB address space.
2483 Above that split equally between PD and PTs */
2484 if (vm_size <= 8)
2485 return (bits - 9);
2486 else
2487 return ((bits + 3) / 2);
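/*
 * Worked example: "bits" is the number of address bits covered by the PD
 * plus one level of page tables (ilog2(vm_size in GB) + 30 for the size in
 * bytes, minus 12 for the 4K GPU page size, i.e. + 18).  For an 8 GB VM,
 * bits = 3 + 18 = 21 and the block size is 21 - 9 = 12, which keeps the PD
 * at 512 entries (4K).  For a 256 GB VM, bits = 8 + 18 = 26 and the even
 * split gives (26 + 3) / 2 = 14 bits per page table block.
 */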
2491 * amdgpu_vm_adjust_size - adjust vm size, block size and fragment size
2493 * @adev: amdgpu_device pointer
2494 * @min_vm_size: the minimum vm size in GB if the size is set to auto
2495 * @fragment_size_default: Default PTE fragment size
2496 * @max_level: max VMPT level
2497 * @max_bits: max address space size in bits
2500 void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,
2501 uint32_t fragment_size_default, unsigned max_level,
2502 unsigned max_bits)
2504 unsigned int max_size = 1 << (max_bits - 30);
2505 unsigned int vm_size;
2506 uint64_t tmp;
2508 /* adjust vm size first */
2509 if (amdgpu_vm_size != -1) {
2510 vm_size = amdgpu_vm_size;
2511 if (vm_size > max_size) {
2512 dev_warn(adev->dev, "VM size (%d) too large, max is %u GB\n",
2513 amdgpu_vm_size, max_size);
2514 vm_size = max_size;
2516 } else {
2517 struct sysinfo si;
2518 unsigned int phys_ram_gb;
2520 /* Optimal VM size depends on the amount of physical
2521 * RAM available. Underlying requirements and
2522 * assumptions:
2524 * - Need to map system memory and VRAM from all GPUs
2525 * - VRAM from other GPUs not known here
2526 * - Assume VRAM <= system memory
2527 * - On GFX8 and older, VM space can be segmented for
2528 * different MTYPEs
2529 * - Need to allow room for fragmentation, guard pages etc.
2531 * This adds up to a rough guess of system memory x3.
2532 * Round up to power of two to maximize the available
2533 * VM size with the given page table size.
2535 si_meminfo(&si);
2536 phys_ram_gb = ((uint64_t)si.totalram * si.mem_unit +
2537 (1 << 30) - 1) >> 30;
2538 vm_size = roundup_pow_of_two(
2539 min(max(phys_ram_gb * 3, min_vm_size), max_size));
2542 adev->vm_manager.max_pfn = (uint64_t)vm_size << 18;
2544 tmp = roundup_pow_of_two(adev->vm_manager.max_pfn);
2545 if (amdgpu_vm_block_size != -1)
2546 tmp >>= amdgpu_vm_block_size - 9;
2547 tmp = DIV_ROUND_UP(fls64(tmp) - 1, 9) - 1;
2548 adev->vm_manager.num_level = min(max_level, (unsigned)tmp);
2549 switch (adev->vm_manager.num_level) {
2550 case 3:
2551 adev->vm_manager.root_level = AMDGPU_VM_PDB2;
2552 break;
2553 case 2:
2554 adev->vm_manager.root_level = AMDGPU_VM_PDB1;
2555 break;
2556 case 1:
2557 adev->vm_manager.root_level = AMDGPU_VM_PDB0;
2558 break;
2559 default:
2560 dev_err(adev->dev, "VMPT only supports 2~4+1 levels\n");
2562 /* block size depends on vm size and hw setup */
2563 if (amdgpu_vm_block_size != -1)
2564 adev->vm_manager.block_size =
2565 min((unsigned)amdgpu_vm_block_size, max_bits
2566 - AMDGPU_GPU_PAGE_SHIFT
2567 - 9 * adev->vm_manager.num_level);
2568 else if (adev->vm_manager.num_level > 1)
2569 adev->vm_manager.block_size = 9;
2570 else
2571 adev->vm_manager.block_size = amdgpu_vm_get_block_size(tmp);
2573 if (amdgpu_vm_fragment_size == -1)
2574 adev->vm_manager.fragment_size = fragment_size_default;
2575 else
2576 adev->vm_manager.fragment_size = amdgpu_vm_fragment_size;
2578 DRM_INFO("vm size is %u GB, %u levels, block size is %u-bit, fragment size is %u-bit\n",
2579 vm_size, adev->vm_manager.num_level + 1,
2580 adev->vm_manager.block_size,
2581 adev->vm_manager.fragment_size);
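/*
 * Worked example with hypothetical numbers (assuming max_level >= 2 and no
 * module parameter overrides): for a 256 GB address space, max_pfn =
 * 256 << 18 = 2^26 GPU pages.  Then fls64(2^26) - 1 = 26 and
 * DIV_ROUND_UP(26, 9) - 1 = 2, so two levels of page tables are used
 * (root_level = AMDGPU_VM_PDB1) and the block size defaults to 9 bits,
 * i.e. each leaf page table block covers 2^9 pages (2 MB).
 */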
2585 * amdgpu_vm_init - initialize a vm instance
2587 * @adev: amdgpu_device pointer
2588 * @vm: requested vm
2589 * @vm_context: Indicates if it is a GFX or Compute context
2590 * @pasid: Process address space identifier
2592 * Init @vm fields.
2594 * Returns:
2595 * 0 for success, error for failure.
2597 int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
2598 int vm_context, unsigned int pasid)
2600 struct amdgpu_bo_param bp;
2601 struct amdgpu_bo *root;
2602 const unsigned align = min(AMDGPU_VM_PTB_ALIGN_SIZE,
2603 AMDGPU_VM_PTE_COUNT(adev) * 8);
2604 unsigned ring_instance;
2605 struct amdgpu_ring *ring;
2606 struct drm_sched_rq *rq;
2607 unsigned long size;
2608 uint64_t flags;
2609 int r, i;
2611 vm->va = RB_ROOT_CACHED;
2612 for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
2613 vm->reserved_vmid[i] = NULL;
2614 INIT_LIST_HEAD(&vm->evicted);
2615 INIT_LIST_HEAD(&vm->relocated);
2616 spin_lock_init(&vm->moved_lock);
2617 INIT_LIST_HEAD(&vm->moved);
2618 INIT_LIST_HEAD(&vm->idle);
2619 INIT_LIST_HEAD(&vm->freed);
2621 /* create scheduler entity for page table updates */
2623 ring_instance = atomic_inc_return(&adev->vm_manager.vm_pte_next_ring);
2624 ring_instance %= adev->vm_manager.vm_pte_num_rings;
2625 ring = adev->vm_manager.vm_pte_rings[ring_instance];
2626 rq = &ring->sched.sched_rq[DRM_SCHED_PRIORITY_KERNEL];
2627 r = drm_sched_entity_init(&vm->entity, &rq, 1, NULL);
2628 if (r)
2629 return r;
2631 vm->pte_support_ats = false;
2633 if (vm_context == AMDGPU_VM_CONTEXT_COMPUTE) {
2634 vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
2635 AMDGPU_VM_USE_CPU_FOR_COMPUTE);
2637 if (adev->asic_type == CHIP_RAVEN)
2638 vm->pte_support_ats = true;
2639 } else {
2640 vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
2641 AMDGPU_VM_USE_CPU_FOR_GFX);
2643 DRM_DEBUG_DRIVER("VM update mode is %s\n",
2644 vm->use_cpu_for_update ? "CPU" : "SDMA");
2645 WARN_ONCE((vm->use_cpu_for_update && !amdgpu_gmc_vram_full_visible(&adev->gmc)),
2646 "CPU update of VM recommended only for large BAR system\n");
2647 vm->last_update = NULL;
2649 flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
2650 if (vm->use_cpu_for_update)
2651 flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
2652 else if (vm_context != AMDGPU_VM_CONTEXT_COMPUTE)
2653 flags |= AMDGPU_GEM_CREATE_SHADOW;
2655 size = amdgpu_vm_bo_size(adev, adev->vm_manager.root_level);
2656 memset(&bp, 0, sizeof(bp));
2657 bp.size = size;
2658 bp.byte_align = align;
2659 bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
2660 bp.flags = flags;
2661 bp.type = ttm_bo_type_kernel;
2662 bp.resv = NULL;
2663 r = amdgpu_bo_create(adev, &bp, &root);
2664 if (r)
2665 goto error_free_sched_entity;
2667 r = amdgpu_bo_reserve(root, true);
2668 if (r)
2669 goto error_free_root;
2671 r = amdgpu_vm_clear_bo(adev, vm, root,
2672 adev->vm_manager.root_level,
2673 vm->pte_support_ats);
2674 if (r)
2675 goto error_unreserve;
2677 amdgpu_vm_bo_base_init(&vm->root.base, vm, root);
2678 amdgpu_bo_unreserve(vm->root.base.bo);
2680 if (pasid) {
2681 unsigned long flags;
2683 spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
2684 r = idr_alloc(&adev->vm_manager.pasid_idr, vm, pasid, pasid + 1,
2685 GFP_ATOMIC);
2686 spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
2687 if (r < 0)
2688 goto error_free_root;
2690 vm->pasid = pasid;
2693 INIT_KFIFO(vm->faults);
2694 vm->fault_credit = 16;
2696 return 0;
2698 error_unreserve:
2699 amdgpu_bo_unreserve(vm->root.base.bo);
2701 error_free_root:
2702 amdgpu_bo_unref(&vm->root.base.bo->shadow);
2703 amdgpu_bo_unref(&vm->root.base.bo);
2704 vm->root.base.bo = NULL;
2706 error_free_sched_entity:
2707 drm_sched_entity_destroy(&vm->entity);
2709 return r;
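/*
 * Illustrative sketch only: a GFX context VM is typically created when a
 * client opens the device, roughly like this (error handling omitted,
 * "fpriv" and "pasid" are assumed to come from the caller):
 *
 *	r = amdgpu_vm_init(adev, &fpriv->vm, AMDGPU_VM_CONTEXT_GFX, pasid);
 *
 * Passing AMDGPU_VM_CONTEXT_COMPUTE instead selects the compute update
 * mode and, on Raven, enables ATS support for the page tables.
 */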
2713 * amdgpu_vm_make_compute - Turn a GFX VM into a compute VM
2715 * @adev: amdgpu_device pointer
2716 * @vm: requested vm
2718 * This only works on GFX VMs that don't have any BOs added and no
2719 * page tables allocated yet.
2721 * Changes the following VM parameters:
2722 * - use_cpu_for_update
2723 * - pte_support_ats
2724 * - pasid (old PASID is released, because compute manages its own PASIDs)
2726 * Reinitializes the page directory to reflect the changed ATS
2727 * setting.
2729 * Returns:
2730 * 0 for success, -errno for errors.
2732 int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm)
2734 bool pte_support_ats = (adev->asic_type == CHIP_RAVEN);
2735 int r;
2737 r = amdgpu_bo_reserve(vm->root.base.bo, true);
2738 if (r)
2739 return r;
2741 /* Sanity checks */
2742 if (!RB_EMPTY_ROOT(&vm->va.rb_root) || vm->root.entries) {
2743 r = -EINVAL;
2744 goto error;
2747 /* Check if PD needs to be reinitialized and do it before
2748 * changing any other state, in case it fails.
2750 if (pte_support_ats != vm->pte_support_ats) {
2751 r = amdgpu_vm_clear_bo(adev, vm, vm->root.base.bo,
2752 adev->vm_manager.root_level,
2753 pte_support_ats);
2754 if (r)
2755 goto error;
2758 /* Update VM state */
2759 vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
2760 AMDGPU_VM_USE_CPU_FOR_COMPUTE);
2761 vm->pte_support_ats = pte_support_ats;
2762 DRM_DEBUG_DRIVER("VM update mode is %s\n",
2763 vm->use_cpu_for_update ? "CPU" : "SDMA");
2764 WARN_ONCE((vm->use_cpu_for_update && !amdgpu_gmc_vram_full_visible(&adev->gmc)),
2765 "CPU update of VM recommended only for large BAR system\n");
2767 if (vm->pasid) {
2768 unsigned long flags;
2770 spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
2771 idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
2772 spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
2774 vm->pasid = 0;
2777 /* Free the shadow bo for compute VM */
2778 amdgpu_bo_unref(&vm->root.base.bo->shadow);
2780 error:
2781 amdgpu_bo_unreserve(vm->root.base.bo);
2782 return r;
2786 * amdgpu_vm_free_levels - free PD/PT levels
2788 * @adev: amdgpu device structure
2789 * @parent: PD/PT starting level to free
2790 * @level: level of parent structure
2792 * Free the page directory or page table level and all sub levels.
2794 static void amdgpu_vm_free_levels(struct amdgpu_device *adev,
2795 struct amdgpu_vm_pt *parent,
2796 unsigned level)
2798 unsigned i, num_entries = amdgpu_vm_num_entries(adev, level);
2800 if (parent->base.bo) {
2801 list_del(&parent->base.bo_list);
2802 list_del(&parent->base.vm_status);
2803 amdgpu_bo_unref(&parent->base.bo->shadow);
2804 amdgpu_bo_unref(&parent->base.bo);
2807 if (parent->entries)
2808 for (i = 0; i < num_entries; i++)
2809 amdgpu_vm_free_levels(adev, &parent->entries[i],
2810 level + 1);
2812 kvfree(parent->entries);
2816 * amdgpu_vm_fini - tear down a vm instance
2818 * @adev: amdgpu_device pointer
2819 * @vm: requested vm
2821 * Tear down @vm.
2822 * Unbind the VM and remove all bos from the vm bo list
2824 void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
2826 struct amdgpu_bo_va_mapping *mapping, *tmp;
2827 bool prt_fini_needed = !!adev->gmc.gmc_funcs->set_prt;
2828 struct amdgpu_bo *root;
2829 u64 fault;
2830 int i, r;
2832 amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
2834 /* Clear pending page faults from IH when the VM is destroyed */
2835 while (kfifo_get(&vm->faults, &fault))
2836 amdgpu_ih_clear_fault(adev, fault);
2838 if (vm->pasid) {
2839 unsigned long flags;
2841 spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
2842 idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
2843 spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
2846 drm_sched_entity_destroy(&vm->entity);
2848 if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
2849 dev_err(adev->dev, "still active bo inside vm\n");
2851 rbtree_postorder_for_each_entry_safe(mapping, tmp,
2852 &vm->va.rb_root, rb) {
2853 list_del(&mapping->list);
2854 amdgpu_vm_it_remove(mapping, &vm->va);
2855 kfree(mapping);
2857 list_for_each_entry_safe(mapping, tmp, &vm->freed, list) {
2858 if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) {
2859 amdgpu_vm_prt_fini(adev, vm);
2860 prt_fini_needed = false;
2863 list_del(&mapping->list);
2864 amdgpu_vm_free_mapping(adev, vm, mapping, NULL);
2867 root = amdgpu_bo_ref(vm->root.base.bo);
2868 r = amdgpu_bo_reserve(root, true);
2869 if (r) {
2870 dev_err(adev->dev, "Leaking page tables because BO reservation failed\n");
2871 } else {
2872 amdgpu_vm_free_levels(adev, &vm->root,
2873 adev->vm_manager.root_level);
2874 amdgpu_bo_unreserve(root);
2876 amdgpu_bo_unref(&root);
2877 dma_fence_put(vm->last_update);
2878 for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
2879 amdgpu_vmid_free_reserved(adev, vm, i);
2883 * amdgpu_vm_pasid_fault_credit - Check fault credit for given PASID
2885 * @adev: amdgpu_device pointer
2886 * @pasid: PASID to identify the VM
2888 * This function is expected to be called in interrupt context.
2890 * Returns:
2891 * True if there was fault credit, false otherwise
2893 bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev,
2894 unsigned int pasid)
2896 struct amdgpu_vm *vm;
2898 spin_lock(&adev->vm_manager.pasid_lock);
2899 vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
2900 if (!vm) {
2901 /* VM not found, can't track fault credit */
2902 spin_unlock(&adev->vm_manager.pasid_lock);
2903 return true;
2906 /* No lock needed. Only accessed by IRQ handler */
2907 if (!vm->fault_credit) {
2908 /* Too many faults in this VM */
2909 spin_unlock(&adev->vm_manager.pasid_lock);
2910 return false;
2913 vm->fault_credit--;
2914 spin_unlock(&adev->vm_manager.pasid_lock);
2915 return true;
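/*
 * Illustrative sketch only: an interrupt handler processing retry faults
 * could consult the credit before scheduling further work, e.g.:
 *
 *	if (!amdgpu_vm_pasid_fault_credit(adev, pasid))
 *		return 1;	/* drop the fault, this VM is flooding */
 *
 * The exact call site and return convention depend on the IH client and
 * are assumptions here.
 */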
2919 * amdgpu_vm_manager_init - init the VM manager
2921 * @adev: amdgpu_device pointer
2923 * Initialize the VM manager structures
2925 void amdgpu_vm_manager_init(struct amdgpu_device *adev)
2927 unsigned i;
2929 amdgpu_vmid_mgr_init(adev);
2931 adev->vm_manager.fence_context =
2932 dma_fence_context_alloc(AMDGPU_MAX_RINGS);
2933 for (i = 0; i < AMDGPU_MAX_RINGS; ++i)
2934 adev->vm_manager.seqno[i] = 0;
2936 atomic_set(&adev->vm_manager.vm_pte_next_ring, 0);
2937 spin_lock_init(&adev->vm_manager.prt_lock);
2938 atomic_set(&adev->vm_manager.num_prt_users, 0);
2940 /* If not overridden by the user, compute VM page tables are by default
2941 * updated by the CPU only on large BAR systems
2943 #ifdef CONFIG_X86_64
2944 if (amdgpu_vm_update_mode == -1) {
2945 if (amdgpu_gmc_vram_full_visible(&adev->gmc))
2946 adev->vm_manager.vm_update_mode =
2947 AMDGPU_VM_USE_CPU_FOR_COMPUTE;
2948 else
2949 adev->vm_manager.vm_update_mode = 0;
2950 } else
2951 adev->vm_manager.vm_update_mode = amdgpu_vm_update_mode;
2952 #else
2953 adev->vm_manager.vm_update_mode = 0;
2954 #endif
2956 idr_init(&adev->vm_manager.pasid_idr);
2957 spin_lock_init(&adev->vm_manager.pasid_lock);
2961 * amdgpu_vm_manager_fini - cleanup VM manager
2963 * @adev: amdgpu_device pointer
2965 * Cleanup the VM manager and free resources.
2967 void amdgpu_vm_manager_fini(struct amdgpu_device *adev)
2969 WARN_ON(!idr_is_empty(&adev->vm_manager.pasid_idr));
2970 idr_destroy(&adev->vm_manager.pasid_idr);
2972 amdgpu_vmid_mgr_fini(adev);
2976 * amdgpu_vm_ioctl - Manages VMID reservation for vm hubs.
2978 * @dev: drm device pointer
2979 * @data: drm_amdgpu_vm
2980 * @filp: drm file pointer
2982 * Returns:
2983 * 0 for success, -errno for errors.
2985 int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
2987 union drm_amdgpu_vm *args = data;
2988 struct amdgpu_device *adev = dev->dev_private;
2989 struct amdgpu_fpriv *fpriv = filp->driver_priv;
2990 int r;
2992 switch (args->in.op) {
2993 case AMDGPU_VM_OP_RESERVE_VMID:
2994 /* currently, we only have a requirement to reserve VMIDs from the gfxhub */
2995 r = amdgpu_vmid_alloc_reserved(adev, &fpriv->vm, AMDGPU_GFXHUB);
2996 if (r)
2997 return r;
2998 break;
2999 case AMDGPU_VM_OP_UNRESERVE_VMID:
3000 amdgpu_vmid_free_reserved(adev, &fpriv->vm, AMDGPU_GFXHUB);
3001 break;
3002 default:
3003 return -EINVAL;
3006 return 0;
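/*
 * Illustrative userspace sketch (assumes libdrm's drmCommandWriteRead() and
 * the definitions from amdgpu_drm.h; not a statement about any particular
 * userspace driver):
 *
 *	union drm_amdgpu_vm args = {};
 *
 *	args.in.op = AMDGPU_VM_OP_RESERVE_VMID;
 *	r = drmCommandWriteRead(fd, DRM_AMDGPU_VM, &args, sizeof(args));
 *
 * A matching AMDGPU_VM_OP_UNRESERVE_VMID call releases the VMID again.
 */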
3010 * amdgpu_vm_get_task_info - Extracts task info for a PASID.
3012 * @adev: amdgpu_device pointer
3013 * @pasid: PASID identifier for VM
3014 * @task_info: task_info to fill.
3016 void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
3017 struct amdgpu_task_info *task_info)
3019 struct amdgpu_vm *vm;
3020 unsigned long flags;
3022 spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
3024 vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
3025 if (vm)
3026 *task_info = vm->task_info;
3028 spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
3032 * amdgpu_vm_set_task_info - Sets VMs task info.
3034 * @vm: vm for which to set the info
3036 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
3038 if (!vm->task_info.pid) {
3039 vm->task_info.pid = current->pid;
3040 get_task_comm(vm->task_info.task_name, current);
3042 if (current->group_leader->mm == current->mm) {
3043 vm->task_info.tgid = current->group_leader->pid;
3044 get_task_comm(vm->task_info.process_name, current->group_leader);