libavutil/hwcontext_cuda.c

   1 /*
   2  * This file is part of FFmpeg.
   3  *
   4  * FFmpeg is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * FFmpeg is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with FFmpeg; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17  */
  18
  19 #include "buffer.h"
  20 #include "common.h"
  21 #include "hwcontext.h"
  22 #include "hwcontext_internal.h"
  23 #include "hwcontext_cuda_internal.h"
  24 #if CONFIG_VULKAN
  25 #include "hwcontext_vulkan.h"
  26 #endif
  27 #include "cuda_check.h"
  28 #include "mem.h"
  29 #include "pixdesc.h"
  30 #include "pixfmt.h"
  31 #include "imgutils.h"
  32
  33 typedef struct CUDAFramesContext {
  34     int shift_width, shift_height;
  35     int tex_alignment;
  36 } CUDAFramesContext;
  37
  38 typedef struct CUDADeviceContext {
  39     AVCUDADeviceContext p;
  40     AVCUDADeviceContextInternal internal;
  41 } CUDADeviceContext;
  42
  43 static const enum AVPixelFormat supported_formats[] = {
  44     AV_PIX_FMT_NV12,
  45     AV_PIX_FMT_NV16,
  46     AV_PIX_FMT_YUV420P,
  47     AV_PIX_FMT_YUVA420P,
  48     AV_PIX_FMT_YUV444P,
  49     AV_PIX_FMT_P010,
  50     AV_PIX_FMT_P016,
  51     AV_PIX_FMT_P216LE,
  52     AV_PIX_FMT_YUV444P16,
  53     AV_PIX_FMT_0RGB32,
  54     AV_PIX_FMT_0BGR32,
  55     AV_PIX_FMT_RGB32,
  56     AV_PIX_FMT_BGR32,
  57 #if CONFIG_VULKAN
  58     AV_PIX_FMT_VULKAN,
  59 #endif
  60 };
  61
  62 #define CHECK_CU(x) FF_CUDA_CHECK_DL(device_ctx, cu, x)
  63
  64 static int cuda_frames_get_constraints(AVHWDeviceContext *ctx,
  65                                        const void *hwconfig,
  66                                        AVHWFramesConstraints *constraints)
  67 {
  68     int i;
  69
  70     constraints->valid_sw_formats = av_malloc_array(FF_ARRAY_ELEMS(supported_formats) + 1,
  71                                                     sizeof(*constraints->valid_sw_formats));
  72     if (!constraints->valid_sw_formats)
  73         return AVERROR(ENOMEM);
  74
  75     for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
  76         constraints->valid_sw_formats[i] = supported_formats[i];
  77     constraints->valid_sw_formats[FF_ARRAY_ELEMS(supported_formats)] = AV_PIX_FMT_NONE;
  78
  79     constraints->valid_hw_formats = av_malloc_array(2, sizeof(*constraints->valid_hw_formats));
  80     if (!constraints->valid_hw_formats)
  81         return AVERROR(ENOMEM);
  82
  83     constraints->valid_hw_formats[0] = AV_PIX_FMT_CUDA;
  84     constraints->valid_hw_formats[1] = AV_PIX_FMT_NONE;
  85
  86     return 0;
  87 }
  88
  89 static void cuda_buffer_free(void *opaque, uint8_t *data)
  90 {
  91     AVHWFramesContext        *ctx = opaque;
  92     AVHWDeviceContext *device_ctx = ctx->device_ctx;
  93     AVCUDADeviceContext    *hwctx = device_ctx->hwctx;
  94     CudaFunctions             *cu = hwctx->internal->cuda_dl;
  95
  96     CUcontext dummy;
  97
  98     CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
  99
 100     CHECK_CU(cu->cuMemFree((CUdeviceptr)data));
 101
 102     CHECK_CU(cu->cuCtxPopCurrent(&dummy));
 103 }
 104
 105 static AVBufferRef *cuda_pool_alloc(void *opaque, size_t size)
 106 {
 107     AVHWFramesContext        *ctx = opaque;
 108     AVHWDeviceContext *device_ctx = ctx->device_ctx;
 109     AVCUDADeviceContext    *hwctx = device_ctx->hwctx;
 110     CudaFunctions             *cu = hwctx->internal->cuda_dl;
 111
 112     AVBufferRef *ret = NULL;
 113     CUcontext dummy = NULL;
 114     CUdeviceptr data;
 115     int err;
 116
 117     err = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
 118     if (err < 0)
 119         return NULL;
 120
 121     err = CHECK_CU(cu->cuMemAlloc(&data, size));
 122     if (err < 0)
 123         goto fail;
 124
 125     ret = av_buffer_create((uint8_t*)data, size, cuda_buffer_free, ctx, 0);
 126     if (!ret) {
 127         CHECK_CU(cu->cuMemFree(data));
 128         goto fail;
 129     }
 130
 131 fail:
 132     CHECK_CU(cu->cuCtxPopCurrent(&dummy));
 133     return ret;
 134 }
 135
 136 static int cuda_frames_init(AVHWFramesContext *ctx)
 137 {
 138     AVHWDeviceContext *device_ctx = ctx->device_ctx;
 139     AVCUDADeviceContext    *hwctx = device_ctx->hwctx;
 140     CUDAFramesContext       *priv = ctx->hwctx;
 141     CudaFunctions             *cu = hwctx->internal->cuda_dl;
 142     int err, i;
 143
 144     for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) {
 145         if (ctx->sw_format == supported_formats[i])
 146             break;
 147     }
 148     if (i == FF_ARRAY_ELEMS(supported_formats)) {
 149         av_log(ctx, AV_LOG_ERROR, "Pixel format '%s' is not supported\n",
 150                av_get_pix_fmt_name(ctx->sw_format));
 151         return AVERROR(ENOSYS);
 152     }
 153
 154     err = CHECK_CU(cu->cuDeviceGetAttribute(&priv->tex_alignment,
 155                                             14 /* CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT */,
 156                                             hwctx->internal->cuda_device));
 157     if (err < 0)
 158         return err;
 159
 160     av_log(ctx, AV_LOG_DEBUG, "CUDA texture alignment: %d\n", priv->tex_alignment);
 161
 162     // YUV420P is a special case.
 163     // Since nvenc expects the U/V planes to have half the linesize of the Y plane
 164     // alignment has to be doubled to ensure the U/V planes still end up aligned.
 165     if (ctx->sw_format == AV_PIX_FMT_YUV420P)
 166         priv->tex_alignment *= 2;
 167
 168     av_pix_fmt_get_chroma_sub_sample(ctx->sw_format, &priv->shift_width, &priv->shift_height);
 169
 170     if (!ctx->pool) {
 171         int size = av_image_get_buffer_size(ctx->sw_format, ctx->width, ctx->height, priv->tex_alignment);
 172         if (size < 0)
 173             return size;
 174
 175         ffhwframesctx(ctx)->pool_internal =
 176             av_buffer_pool_init2(size, ctx, cuda_pool_alloc, NULL);
 177         if (!ffhwframesctx(ctx)->pool_internal)
 178             return AVERROR(ENOMEM);
 179     }
 180
 181     return 0;
 182 }
 183
 184 static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame)
 185 {
 186     CUDAFramesContext *priv = ctx->hwctx;
 187     int res;
 188
 189     frame->buf[0] = av_buffer_pool_get(ctx->pool);
 190     if (!frame->buf[0])
 191         return AVERROR(ENOMEM);
 192
 193     res = av_image_fill_arrays(frame->data, frame->linesize, frame->buf[0]->data,
 194                                ctx->sw_format, ctx->width, ctx->height, priv->tex_alignment);
 195     if (res < 0)
 196         return res;
 197
 198     // YUV420P is a special case.
 199     // Nvenc expects the U/V planes in swapped order from how ffmpeg expects them, also chroma is half-aligned
 200     if (ctx->sw_format == AV_PIX_FMT_YUV420P) {
 201         frame->linesize[1] = frame->linesize[2] = frame->linesize[0] / 2;
 202         frame->data[2]     = frame->data[1];
 203         frame->data[1]     = frame->data[2] + frame->linesize[2] * (ctx->height / 2);
 204     }
 205
 206     frame->format = AV_PIX_FMT_CUDA;
 207     frame->width  = ctx->width;
 208     frame->height = ctx->height;
 209
 210     return 0;
 211 }
 212
 213 static int cuda_transfer_get_formats(AVHWFramesContext *ctx,
 214                                      enum AVHWFrameTransferDirection dir,
 215                                      enum AVPixelFormat **formats)
 216 {
 217     enum AVPixelFormat *fmts;
 218
 219     fmts = av_malloc_array(2, sizeof(*fmts));
 220     if (!fmts)
 221         return AVERROR(ENOMEM);
 222
 223     fmts[0] = ctx->sw_format;
 224     fmts[1] = AV_PIX_FMT_NONE;
 225
 226     *formats = fmts;
 227
 228     return 0;
 229 }
 230
 231 static int cuda_transfer_data(AVHWFramesContext *ctx, AVFrame *dst,
 232                                  const AVFrame *src)
 233 {
 234     CUDAFramesContext       *priv = ctx->hwctx;
 235     AVHWDeviceContext *device_ctx = ctx->device_ctx;
 236     AVCUDADeviceContext    *hwctx = device_ctx->hwctx;
 237     CudaFunctions             *cu = hwctx->internal->cuda_dl;
 238
 239     CUcontext dummy;
 240     int i, ret;
 241
 242     if ((src->hw_frames_ctx && ((AVHWFramesContext*)src->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA) ||
 243         (dst->hw_frames_ctx && ((AVHWFramesContext*)dst->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA))
 244         return AVERROR(ENOSYS);
 245
 246     ret = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
 247     if (ret < 0)
 248         return ret;
 249
 250     for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) {
 251         CUDA_MEMCPY2D cpy = {
 252             .srcPitch      = src->linesize[i],
 253             .dstPitch      = dst->linesize[i],
 254             .WidthInBytes  = FFMIN(src->linesize[i], dst->linesize[i]),
 255             .Height        = src->height >> ((i == 0 || i == 3) ? 0 : priv->shift_height),
 256         };
 257
 258         if (src->hw_frames_ctx) {
 259             cpy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
 260             cpy.srcDevice     = (CUdeviceptr)src->data[i];
 261         } else {
 262             cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
 263             cpy.srcHost       = src->data[i];
 264         }
 265
 266         if (dst->hw_frames_ctx) {
 267             cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
 268             cpy.dstDevice     = (CUdeviceptr)dst->data[i];
 269         } else {
 270             cpy.dstMemoryType = CU_MEMORYTYPE_HOST;
 271             cpy.dstHost       = dst->data[i];
 272         }
 273
 274         ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, hwctx->stream));
 275         if (ret < 0)
 276             goto exit;
 277     }
 278
 279     if (!dst->hw_frames_ctx) {
 280         ret = CHECK_CU(cu->cuStreamSynchronize(hwctx->stream));
 281         if (ret < 0)
 282             goto exit;
 283     }
 284
 285 exit:
 286     CHECK_CU(cu->cuCtxPopCurrent(&dummy));
 287
 288     return 0;
 289 }
 290
 291 static void cuda_device_uninit(AVHWDeviceContext *device_ctx)
 292 {
 293     CUDADeviceContext *hwctx = device_ctx->hwctx;
 294
 295     if (hwctx->p.internal) {
 296         CudaFunctions *cu = hwctx->internal.cuda_dl;
 297
 298         if (hwctx->internal.is_allocated && hwctx->p.cuda_ctx) {
 299             if (hwctx->internal.flags & AV_CUDA_USE_PRIMARY_CONTEXT)
 300                 CHECK_CU(cu->cuDevicePrimaryCtxRelease(hwctx->internal.cuda_device));
 301             else if (!(hwctx->internal.flags & AV_CUDA_USE_CURRENT_CONTEXT))
 302                 CHECK_CU(cu->cuCtxDestroy(hwctx->p.cuda_ctx));
 303
 304             hwctx->p.cuda_ctx = NULL;
 305         }
 306
 307         cuda_free_functions(&hwctx->internal.cuda_dl);
 308         memset(&hwctx->internal, 0, sizeof(hwctx->internal));
 309         hwctx->p.internal = NULL;
 310     }
 311 }
 312
 313 static int cuda_device_init(AVHWDeviceContext *ctx)
 314 {
 315     CUDADeviceContext *hwctx = ctx->hwctx;
 316     int ret;
 317
 318     hwctx->p.internal = &hwctx->internal;
 319
 320     if (!hwctx->internal.cuda_dl) {
 321         ret = cuda_load_functions(&hwctx->internal.cuda_dl, ctx);
 322         if (ret < 0) {
 323             av_log(ctx, AV_LOG_ERROR, "Could not dynamically load CUDA\n");
 324             goto error;
 325         }
 326     }
 327
 328     return 0;
 329
 330 error:
 331     cuda_device_uninit(ctx);
 332     return ret;
 333 }
 334
 335 static int cuda_context_init(AVHWDeviceContext *device_ctx, int flags) {
 336     AVCUDADeviceContext *hwctx = device_ctx->hwctx;
 337     CudaFunctions *cu;
 338     CUcontext dummy;
 339     int ret, dev_active = 0;
 340     unsigned int dev_flags = 0;
 341
 342     const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC;
 343
 344     cu = hwctx->internal->cuda_dl;
 345
 346     hwctx->internal->flags = flags;
 347
 348     if (flags & AV_CUDA_USE_PRIMARY_CONTEXT) {
 349         ret = CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device,
 350                        &dev_flags, &dev_active));
 351         if (ret < 0)
 352             return ret;
 353
 354         if (dev_active && dev_flags != desired_flags) {
 355             av_log(device_ctx, AV_LOG_ERROR, "Primary context already active with incompatible flags.\n");
 356             return AVERROR(ENOTSUP);
 357         } else if (dev_flags != desired_flags) {
 358             ret = CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device,
 359                            desired_flags));
 360             if (ret < 0)
 361                 return ret;
 362         }
 363
 364         ret = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx,
 365                                                     hwctx->internal->cuda_device));
 366         if (ret < 0)
 367             return ret;
 368     } else if (flags & AV_CUDA_USE_CURRENT_CONTEXT) {
 369         ret = CHECK_CU(cu->cuCtxGetCurrent(&hwctx->cuda_ctx));
 370         if (ret < 0)
 371             return ret;
 372         av_log(device_ctx, AV_LOG_INFO, "Using current CUDA context.\n");
 373     } else {
 374         ret = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags,
 375                                        hwctx->internal->cuda_device));
 376         if (ret < 0)
 377             return ret;
 378
 379         CHECK_CU(cu->cuCtxPopCurrent(&dummy));
 380     }
 381
 382     hwctx->internal->is_allocated = 1;
 383
 384     // Setting stream to NULL will make functions automatically use the default CUstream
 385     hwctx->stream = NULL;
 386
 387     return 0;
 388 }
 389
 390 static int cuda_flags_from_opts(AVHWDeviceContext *device_ctx,
 391                                 AVDictionary *opts, int *flags)
 392 {
 393     AVDictionaryEntry *primary_ctx_opt = av_dict_get(opts, "primary_ctx", NULL, 0);
 394     AVDictionaryEntry *current_ctx_opt = av_dict_get(opts, "current_ctx", NULL, 0);
 395
 396     int use_primary_ctx = 0, use_current_ctx = 0;
 397     if (primary_ctx_opt)
 398         use_primary_ctx = strtol(primary_ctx_opt->value, NULL, 10);
 399
 400     if (current_ctx_opt)
 401         use_current_ctx = strtol(current_ctx_opt->value, NULL, 10);
 402
 403     if (use_primary_ctx && use_current_ctx) {
 404         av_log(device_ctx, AV_LOG_ERROR, "Requested both primary and current CUDA context simultaneously.\n");
 405         return AVERROR(EINVAL);
 406     }
 407
 408     if (primary_ctx_opt && use_primary_ctx) {
 409         av_log(device_ctx, AV_LOG_VERBOSE, "Using CUDA primary device context\n");
 410         *flags |= AV_CUDA_USE_PRIMARY_CONTEXT;
 411     } else if (primary_ctx_opt) {
 412         av_log(device_ctx, AV_LOG_VERBOSE, "Disabling use of CUDA primary device context\n");
 413         *flags &= ~AV_CUDA_USE_PRIMARY_CONTEXT;
 414     }
 415
 416     if (current_ctx_opt && use_current_ctx) {
 417         av_log(device_ctx, AV_LOG_VERBOSE, "Using CUDA current device context\n");
 418         *flags |= AV_CUDA_USE_CURRENT_CONTEXT;
 419     } else if (current_ctx_opt) {
 420         av_log(device_ctx, AV_LOG_VERBOSE, "Disabling use of CUDA current device context\n");
 421         *flags &= ~AV_CUDA_USE_CURRENT_CONTEXT;
 422     }
 423
 424     return 0;
 425 }
 426
 427 static int cuda_device_create(AVHWDeviceContext *device_ctx,
 428                               const char *device,
 429                               AVDictionary *opts, int flags)
 430 {
 431     AVCUDADeviceContext *hwctx = device_ctx->hwctx;
 432     CudaFunctions *cu;
 433     int ret, device_idx = 0;
 434
 435     ret = cuda_flags_from_opts(device_ctx, opts, &flags);
 436     if (ret < 0)
 437         goto error;
 438
 439     if (device)
 440         device_idx = strtol(device, NULL, 0);
 441
 442     ret = cuda_device_init(device_ctx);
 443     if (ret < 0)
 444         goto error;
 445
 446     cu = hwctx->internal->cuda_dl;
 447
 448     ret = CHECK_CU(cu->cuInit(0));
 449     if (ret < 0)
 450         goto error;
 451
 452     ret = CHECK_CU(cu->cuDeviceGet(&hwctx->internal->cuda_device, device_idx));
 453     if (ret < 0)
 454         goto error;
 455
 456     ret = cuda_context_init(device_ctx, flags);
 457     if (ret < 0)
 458         goto error;
 459
 460     return 0;
 461
 462 error:
 463     cuda_device_uninit(device_ctx);
 464     return ret;
 465 }
 466
 467 static int cuda_device_derive(AVHWDeviceContext *device_ctx,
 468                               AVHWDeviceContext *src_ctx, AVDictionary *opts,
 469                               int flags) {
 470     AVCUDADeviceContext *hwctx = device_ctx->hwctx;
 471     CudaFunctions *cu;
 472     const char *src_uuid = NULL;
 473 #if CONFIG_VULKAN
 474     VkPhysicalDeviceIDProperties vk_idp;
 475 #endif
 476     int ret, i, device_count;
 477
 478     ret = cuda_flags_from_opts(device_ctx, opts, &flags);
 479     if (ret < 0)
 480         goto error;
 481
 482 #if CONFIG_VULKAN
 483     vk_idp = (VkPhysicalDeviceIDProperties) {
 484         .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES,
 485     };
 486 #endif
 487
 488     switch (src_ctx->type) {
 489 #if CONFIG_VULKAN
 490 #define TYPE PFN_vkGetPhysicalDeviceProperties2
 491     case AV_HWDEVICE_TYPE_VULKAN: {
 492         AVVulkanDeviceContext *vkctx = src_ctx->hwctx;
 493         TYPE prop_fn = (TYPE)vkctx->get_proc_addr(vkctx->inst, "vkGetPhysicalDeviceProperties2");
 494         VkPhysicalDeviceProperties2 vk_dev_props = {
 495             .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
 496             .pNext = &vk_idp,
 497         };
 498         prop_fn(vkctx->phys_dev, &vk_dev_props);
 499         src_uuid = vk_idp.deviceUUID;
 500         break;
 501     }
 502 #undef TYPE
 503 #endif
 504     default:
 505         ret = AVERROR(ENOSYS);
 506         goto error;
 507     }
 508
 509     if (!src_uuid) {
 510         av_log(device_ctx, AV_LOG_ERROR,
 511                "Failed to get UUID of source device.\n");
 512         ret = AVERROR(EINVAL);
 513         goto error;
 514     }
 515
 516     ret = cuda_device_init(device_ctx);
 517     if (ret < 0)
 518         goto error;
 519
 520     cu = hwctx->internal->cuda_dl;
 521
 522     ret = CHECK_CU(cu->cuInit(0));
 523     if (ret < 0)
 524         goto error;
 525
 526     ret = CHECK_CU(cu->cuDeviceGetCount(&device_count));
 527     if (ret < 0)
 528         goto error;
 529
 530     hwctx->internal->cuda_device = -1;
 531     for (i = 0; i < device_count; i++) {
 532         CUdevice dev;
 533         CUuuid uuid;
 534
 535         ret = CHECK_CU(cu->cuDeviceGet(&dev, i));
 536         if (ret < 0)
 537             goto error;
 538
 539         ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev));
 540         if (ret < 0)
 541             goto error;
 542
 543         if (memcmp(src_uuid, uuid.bytes, sizeof (uuid.bytes)) == 0) {
 544             hwctx->internal->cuda_device = dev;
 545             break;
 546         }
 547     }
 548
 549     if (hwctx->internal->cuda_device == -1) {
 550         av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA device.\n");
 551         goto error;
 552     }
 553
 554     ret = cuda_context_init(device_ctx, flags);
 555     if (ret < 0)
 556         goto error;
 557
 558     return 0;
 559
 560 error:
 561     cuda_device_uninit(device_ctx);
 562     return ret;
 563 }
 564
 565 const HWContextType ff_hwcontext_type_cuda = {
 566     .type                 = AV_HWDEVICE_TYPE_CUDA,
 567     .name                 = "CUDA",
 568
 569     .device_hwctx_size    = sizeof(CUDADeviceContext),
 570     .frames_hwctx_size    = sizeof(CUDAFramesContext),
 571
 572     .device_create        = cuda_device_create,
 573     .device_derive        = cuda_device_derive,
 574     .device_init          = cuda_device_init,
 575     .device_uninit        = cuda_device_uninit,
 576     .frames_get_constraints = cuda_frames_get_constraints,
 577     .frames_init          = cuda_frames_init,
 578     .frames_get_buffer    = cuda_get_buffer,
 579     .transfer_get_formats = cuda_transfer_get_formats,
 580     .transfer_data_to     = cuda_transfer_data,
 581     .transfer_data_from   = cuda_transfer_data,
 582
 583     .pix_fmts             = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE },
 584 };