drivers/iommu/iommufd/vfio_compat.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
   3  */
   4 #include <linux/file.h>
   5 #include <linux/interval_tree.h>
   6 #include <linux/iommu.h>
   7 #include <linux/iommufd.h>
   8 #include <linux/slab.h>
   9 #include <linux/vfio.h>
  10 #include <uapi/linux/vfio.h>
  11 #include <uapi/linux/iommufd.h>
  12
  13 #include "iommufd_private.h"
  14
  15 static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
  16 {
  17         struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);
  18
  19         xa_lock(&ictx->objects);
  20         if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
  21                 goto out_unlock;
  22         ioas = ictx->vfio_ioas;
  23 out_unlock:
  24         xa_unlock(&ictx->objects);
  25         return ioas;
  26 }
  27
  28 /**
  29  * iommufd_vfio_compat_ioas_get_id - Ensure a compat IOAS exists
  30  * @ictx: Context to operate on
  31  * @out_ioas_id: The IOAS ID of the compatibility IOAS
  32  *
  33  * Return the ID of the current compatibility IOAS. The ID can be passed into
  34  * other functions that take an ioas_id.
  35  */
  36 int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
  37 {
  38         struct iommufd_ioas *ioas;
  39
  40         ioas = get_compat_ioas(ictx);
  41         if (IS_ERR(ioas))
  42                 return PTR_ERR(ioas);
  43         *out_ioas_id = ioas->obj.id;
  44         iommufd_put_object(ictx, &ioas->obj);
  45         return 0;
  46 }
  47 EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO);
  48
  49 /**
  50  * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached
  51  * @ictx: Context to operate on
  52  *
  53  * This allows selecting the VFIO_NOIOMMU_IOMMU and blocks normal types.
  54  */
  55 int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
  56 {
  57         int ret;
  58
  59         xa_lock(&ictx->objects);
  60         if (!ictx->vfio_ioas) {
  61                 ictx->no_iommu_mode = 1;
  62                 ret = 0;
  63         } else {
  64                 ret = -EINVAL;
  65         }
  66         xa_unlock(&ictx->objects);
  67         return ret;
  68 }
  69 EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO);
  70
  71 /**
  72  * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created
  73  * @ictx: Context to operate on
  74  *
  75  * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate
  76  * on since they do not have an IOAS ID input in their ABI. Only attaching a
  77  * group should cause a default creation of the internal ioas, this does nothing
  78  * if an existing ioas has already been assigned somehow.
  79  */
  80 int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx)
  81 {
  82         struct iommufd_ioas *ioas = NULL;
  83         int ret;
  84
  85         ioas = iommufd_ioas_alloc(ictx);
  86         if (IS_ERR(ioas))
  87                 return PTR_ERR(ioas);
  88
  89         xa_lock(&ictx->objects);
  90         /*
  91          * VFIO won't allow attaching a container to both iommu and no iommu
  92          * operation
  93          */
  94         if (ictx->no_iommu_mode) {
  95                 ret = -EINVAL;
  96                 goto out_abort;
  97         }
  98
  99         if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
 100                 ret = 0;
 101                 iommufd_put_object(ictx, &ictx->vfio_ioas->obj);
 102                 goto out_abort;
 103         }
 104         ictx->vfio_ioas = ioas;
 105         xa_unlock(&ictx->objects);
 106
 107         /*
 108          * An automatically created compat IOAS is treated as a userspace
 109          * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
 110          * and if not manually destroyed it will be destroyed automatically
 111          * at iommufd release.
 112          */
 113         iommufd_object_finalize(ictx, &ioas->obj);
 114         return 0;
 115
 116 out_abort:
 117         xa_unlock(&ictx->objects);
 118         iommufd_object_abort(ictx, &ioas->obj);
 119         return ret;
 120 }
 121 EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO);
 122
 123 int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
 124 {
 125         struct iommu_vfio_ioas *cmd = ucmd->cmd;
 126         struct iommufd_ioas *ioas;
 127
 128         if (cmd->__reserved)
 129                 return -EOPNOTSUPP;
 130         switch (cmd->op) {
 131         case IOMMU_VFIO_IOAS_GET:
 132                 ioas = get_compat_ioas(ucmd->ictx);
 133                 if (IS_ERR(ioas))
 134                         return PTR_ERR(ioas);
 135                 cmd->ioas_id = ioas->obj.id;
 136                 iommufd_put_object(ucmd->ictx, &ioas->obj);
 137                 return iommufd_ucmd_respond(ucmd, sizeof(*cmd));
 138
 139         case IOMMU_VFIO_IOAS_SET:
 140                 ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
 141                 if (IS_ERR(ioas))
 142                         return PTR_ERR(ioas);
 143                 xa_lock(&ucmd->ictx->objects);
 144                 ucmd->ictx->vfio_ioas = ioas;
 145                 xa_unlock(&ucmd->ictx->objects);
 146                 iommufd_put_object(ucmd->ictx, &ioas->obj);
 147                 return 0;
 148
 149         case IOMMU_VFIO_IOAS_CLEAR:
 150                 xa_lock(&ucmd->ictx->objects);
 151                 ucmd->ictx->vfio_ioas = NULL;
 152                 xa_unlock(&ucmd->ictx->objects);
 153                 return 0;
 154         default:
 155                 return -EOPNOTSUPP;
 156         }
 157 }
 158
 159 static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
 160                                 void __user *arg)
 161 {
 162         u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 163         size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 164         struct vfio_iommu_type1_dma_map map;
 165         int iommu_prot = IOMMU_CACHE;
 166         struct iommufd_ioas *ioas;
 167         unsigned long iova;
 168         int rc;
 169
 170         if (copy_from_user(&map, arg, minsz))
 171                 return -EFAULT;
 172
 173         if (map.argsz < minsz || map.flags & ~supported_flags)
 174                 return -EINVAL;
 175
 176         if (map.flags & VFIO_DMA_MAP_FLAG_READ)
 177                 iommu_prot |= IOMMU_READ;
 178         if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
 179                 iommu_prot |= IOMMU_WRITE;
 180
 181         ioas = get_compat_ioas(ictx);
 182         if (IS_ERR(ioas))
 183                 return PTR_ERR(ioas);
 184
 185         /*
 186          * Maps created through the legacy interface always use VFIO compatible
 187          * rlimit accounting. If the user wishes to use the faster user based
 188          * rlimit accounting then they must use the new interface.
 189          */
 190         iova = map.iova;
 191         rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr),
 192                                  map.size, iommu_prot, 0);
 193         iommufd_put_object(ictx, &ioas->obj);
 194         return rc;
 195 }
 196
 197 static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
 198                                   void __user *arg)
 199 {
 200         size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 201         /*
 202          * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
 203          * dirty tracking direction:
 204          *  https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
 205          *  https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
 206          */
 207         u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
 208         struct vfio_iommu_type1_dma_unmap unmap;
 209         unsigned long unmapped = 0;
 210         struct iommufd_ioas *ioas;
 211         int rc;
 212
 213         if (copy_from_user(&unmap, arg, minsz))
 214                 return -EFAULT;
 215
 216         if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
 217                 return -EINVAL;
 218
 219         ioas = get_compat_ioas(ictx);
 220         if (IS_ERR(ioas))
 221                 return PTR_ERR(ioas);
 222
 223         if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
 224                 if (unmap.iova != 0 || unmap.size != 0) {
 225                         rc = -EINVAL;
 226                         goto err_put;
 227                 }
 228                 rc = iopt_unmap_all(&ioas->iopt, &unmapped);
 229         } else {
 230                 if (READ_ONCE(ioas->iopt.disable_large_pages)) {
 231                         /*
 232                          * Create cuts at the start and last of the requested
 233                          * range. If the start IOVA is 0 then it doesn't need to
 234                          * be cut.
 235                          */
 236                         unsigned long iovas[] = { unmap.iova + unmap.size - 1,
 237                                                   unmap.iova - 1 };
 238
 239                         rc = iopt_cut_iova(&ioas->iopt, iovas,
 240                                            unmap.iova ? 2 : 1);
 241                         if (rc)
 242                                 goto err_put;
 243                 }
 244                 rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
 245                                      &unmapped);
 246         }
 247         unmap.size = unmapped;
 248         if (copy_to_user(arg, &unmap, minsz))
 249                 rc = -EFAULT;
 250
 251 err_put:
 252         iommufd_put_object(ictx, &ioas->obj);
 253         return rc;
 254 }
 255
 256 static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
 257 {
 258         struct iommufd_hwpt_paging *hwpt_paging;
 259         struct iommufd_ioas *ioas;
 260         int rc = 1;
 261
 262         ioas = get_compat_ioas(ictx);
 263         if (IS_ERR(ioas))
 264                 return PTR_ERR(ioas);
 265
 266         mutex_lock(&ioas->mutex);
 267         list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
 268                 if (!hwpt_paging->enforce_cache_coherency) {
 269                         rc = 0;
 270                         break;
 271                 }
 272         }
 273         mutex_unlock(&ioas->mutex);
 274
 275         iommufd_put_object(ictx, &ioas->obj);
 276         return rc;
 277 }
 278
 279 static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
 280                                         unsigned long type)
 281 {
 282         switch (type) {
 283         case VFIO_TYPE1_IOMMU:
 284         case VFIO_TYPE1v2_IOMMU:
 285         case VFIO_UNMAP_ALL:
 286                 return 1;
 287
 288         case VFIO_NOIOMMU_IOMMU:
 289                 return IS_ENABLED(CONFIG_VFIO_NOIOMMU);
 290
 291         case VFIO_DMA_CC_IOMMU:
 292                 return iommufd_vfio_cc_iommu(ictx);
 293
 294         /*
 295          * This is obsolete, and to be removed from VFIO. It was an incomplete
 296          * idea that got merged.
 297          * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
 298          */
 299         case VFIO_TYPE1_NESTING_IOMMU:
 300                 return 0;
 301
 302         /*
 303          * VFIO_DMA_MAP_FLAG_VADDR
 304          * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
 305          * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
 306          *
 307          * It is hard to see how this could be implemented safely.
 308          */
 309         case VFIO_UPDATE_VADDR:
 310         default:
 311                 return 0;
 312         }
 313 }
 314
 315 static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
 316 {
 317         bool no_iommu_mode = READ_ONCE(ictx->no_iommu_mode);
 318         struct iommufd_ioas *ioas = NULL;
 319         int rc = 0;
 320
 321         /*
 322          * Emulation for NOIOMMU is imperfect in that VFIO blocks almost all
 323          * other ioctls. We let them keep working but they mostly fail since no
 324          * IOAS should exist.
 325          */
 326         if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU &&
 327             no_iommu_mode) {
 328                 if (!capable(CAP_SYS_RAWIO))
 329                         return -EPERM;
 330                 return 0;
 331         }
 332
 333         if ((type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) ||
 334             no_iommu_mode)
 335                 return -EINVAL;
 336
 337         /* VFIO fails the set_iommu if there is no group */
 338         ioas = get_compat_ioas(ictx);
 339         if (IS_ERR(ioas))
 340                 return PTR_ERR(ioas);
 341
 342         /*
 343          * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
 344          * the middle of mapped ranges. This is complicated by huge page support
 345          * which creates single large IOPTEs that cannot be split by the iommu
 346          * driver. TYPE1 is very old at this point and likely nothing uses it,
 347          * however it is simple enough to emulate by simply disabling the
 348          * problematic large IOPTEs. Then we can safely unmap within any range.
 349          */
 350         if (type == VFIO_TYPE1_IOMMU)
 351                 rc = iopt_disable_large_pages(&ioas->iopt);
 352         iommufd_put_object(ictx, &ioas->obj);
 353         return rc;
 354 }
 355
 356 static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
 357 {
 358         struct io_pagetable *iopt = &ioas->iopt;
 359         unsigned long pgsize_bitmap = ULONG_MAX;
 360         struct iommu_domain *domain;
 361         unsigned long index;
 362
 363         down_read(&iopt->domains_rwsem);
 364         xa_for_each(&iopt->domains, index, domain)
 365                 pgsize_bitmap &= domain->pgsize_bitmap;
 366
 367         /* See vfio_update_pgsize_bitmap() */
 368         if (pgsize_bitmap & ~PAGE_MASK) {
 369                 pgsize_bitmap &= PAGE_MASK;
 370                 pgsize_bitmap |= PAGE_SIZE;
 371         }
 372         pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
 373         up_read(&iopt->domains_rwsem);
 374         return pgsize_bitmap;
 375 }
 376
 377 static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
 378                                  struct vfio_info_cap_header __user *cur,
 379                                  size_t avail)
 380 {
 381         struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
 382                 container_of(cur,
 383                              struct vfio_iommu_type1_info_cap_iova_range __user,
 384                              header);
 385         struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
 386                 .header = {
 387                         .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
 388                         .version = 1,
 389                 },
 390         };
 391         struct interval_tree_span_iter span;
 392
 393         interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
 394                                     ULONG_MAX) {
 395                 struct vfio_iova_range range;
 396
 397                 if (!span.is_hole)
 398                         continue;
 399                 range.start = span.start_hole;
 400                 range.end = span.last_hole;
 401                 if (avail >= struct_size(&cap_iovas, iova_ranges,
 402                                          cap_iovas.nr_iovas + 1) &&
 403                     copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
 404                                  &range, sizeof(range)))
 405                         return -EFAULT;
 406                 cap_iovas.nr_iovas++;
 407         }
 408         if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
 409             copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
 410                 return -EFAULT;
 411         return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
 412 }
 413
 414 static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
 415                                       struct vfio_info_cap_header __user *cur,
 416                                       size_t avail)
 417 {
 418         struct vfio_iommu_type1_info_dma_avail cap_dma = {
 419                 .header = {
 420                         .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
 421                         .version = 1,
 422                 },
 423                 /*
 424                  * iommufd's limit is based on the cgroup's memory limit.
 425                  * Normally vfio would return U16_MAX here, and provide a module
 426                  * parameter to adjust it. Since S390 qemu userspace actually
 427                  * pays attention and needs a value bigger than U16_MAX return
 428                  * U32_MAX.
 429                  */
 430                 .avail = U32_MAX,
 431         };
 432
 433         if (avail >= sizeof(cap_dma) &&
 434             copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
 435                 return -EFAULT;
 436         return sizeof(cap_dma);
 437 }
 438
 439 static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
 440                                        void __user *arg)
 441 {
 442         typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
 443                                    struct vfio_info_cap_header __user *cur,
 444                                    size_t avail);
 445         static const fill_cap_fn fill_fns[] = {
 446                 iommufd_fill_cap_dma_avail,
 447                 iommufd_fill_cap_iova,
 448         };
 449         size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
 450         struct vfio_info_cap_header __user *last_cap = NULL;
 451         struct vfio_iommu_type1_info info = {};
 452         struct iommufd_ioas *ioas;
 453         size_t total_cap_size;
 454         int rc;
 455         int i;
 456
 457         if (copy_from_user(&info, arg, minsz))
 458                 return -EFAULT;
 459
 460         if (info.argsz < minsz)
 461                 return -EINVAL;
 462         minsz = min_t(size_t, info.argsz, sizeof(info));
 463
 464         ioas = get_compat_ioas(ictx);
 465         if (IS_ERR(ioas))
 466                 return PTR_ERR(ioas);
 467
 468         info.flags = VFIO_IOMMU_INFO_PGSIZES;
 469         info.iova_pgsizes = iommufd_get_pagesizes(ioas);
 470         info.cap_offset = 0;
 471
 472         down_read(&ioas->iopt.iova_rwsem);
 473         total_cap_size = sizeof(info);
 474         for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
 475                 int cap_size;
 476
 477                 if (info.argsz > total_cap_size)
 478                         cap_size = fill_fns[i](ioas, arg + total_cap_size,
 479                                                info.argsz - total_cap_size);
 480                 else
 481                         cap_size = fill_fns[i](ioas, NULL, 0);
 482                 if (cap_size < 0) {
 483                         rc = cap_size;
 484                         goto out_put;
 485                 }
 486                 cap_size = ALIGN(cap_size, sizeof(u64));
 487
 488                 if (last_cap && info.argsz >= total_cap_size &&
 489                     put_user(total_cap_size, &last_cap->next)) {
 490                         rc = -EFAULT;
 491                         goto out_put;
 492                 }
 493                 last_cap = arg + total_cap_size;
 494                 total_cap_size += cap_size;
 495         }
 496
 497         /*
 498          * If the user did not provide enough space then only some caps are
 499          * returned and the argsz will be updated to the correct amount to get
 500          * all caps.
 501          */
 502         if (info.argsz >= total_cap_size)
 503                 info.cap_offset = sizeof(info);
 504         info.argsz = total_cap_size;
 505         info.flags |= VFIO_IOMMU_INFO_CAPS;
 506         if (copy_to_user(arg, &info, minsz)) {
 507                 rc = -EFAULT;
 508                 goto out_put;
 509         }
 510         rc = 0;
 511
 512 out_put:
 513         up_read(&ioas->iopt.iova_rwsem);
 514         iommufd_put_object(ictx, &ioas->obj);
 515         return rc;
 516 }
 517
 518 int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
 519                        unsigned long arg)
 520 {
 521         void __user *uarg = (void __user *)arg;
 522
 523         switch (cmd) {
 524         case VFIO_GET_API_VERSION:
 525                 return VFIO_API_VERSION;
 526         case VFIO_SET_IOMMU:
 527                 return iommufd_vfio_set_iommu(ictx, arg);
 528         case VFIO_CHECK_EXTENSION:
 529                 return iommufd_vfio_check_extension(ictx, arg);
 530         case VFIO_IOMMU_GET_INFO:
 531                 return iommufd_vfio_iommu_get_info(ictx, uarg);
 532         case VFIO_IOMMU_MAP_DMA:
 533                 return iommufd_vfio_map_dma(ictx, cmd, uarg);
 534         case VFIO_IOMMU_UNMAP_DMA:
 535                 return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
 536         case VFIO_IOMMU_DIRTY_PAGES:
 537         default:
 538                 return -ENOIOCTLCMD;
 539         }
 540         return -ENOIOCTLCMD;
 541 }