/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "hwcontext.h"
#include "hwcontext_internal.h"
#include "hwcontext_cuda_internal.h"
#if CONFIG_VULKAN
#include "hwcontext_vulkan.h"
#endif
#include "cuda_check.h"
#include "imgutils.h"
#include "mem.h"
#include "pixdesc.h"

typedef struct CUDAFramesContext {
    int shift_width, shift_height;
    int tex_alignment;
} CUDAFramesContext;

typedef struct CUDADeviceContext {
    AVCUDADeviceContext p;
    AVCUDADeviceContextInternal internal;
} CUDADeviceContext;

static const enum AVPixelFormat supported_formats[] = {
    AV_PIX_FMT_NV12,
    AV_PIX_FMT_YUV420P,
    AV_PIX_FMT_YUVA420P,
    AV_PIX_FMT_YUV444P,
    AV_PIX_FMT_P010,
    AV_PIX_FMT_P016,
    AV_PIX_FMT_YUV444P16,
    AV_PIX_FMT_0RGB32,
    AV_PIX_FMT_0BGR32,
    AV_PIX_FMT_RGB32,
    AV_PIX_FMT_BGR32,
};

#define CHECK_CU(x) FF_CUDA_CHECK_DL(device_ctx, cu, x)

static int cuda_frames_get_constraints(AVHWDeviceContext *ctx,
                                       const void *hwconfig,
                                       AVHWFramesConstraints *constraints)
{
    int i;

    constraints->valid_sw_formats = av_malloc_array(FF_ARRAY_ELEMS(supported_formats) + 1,
                                                    sizeof(*constraints->valid_sw_formats));
    if (!constraints->valid_sw_formats)
        return AVERROR(ENOMEM);

    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
        constraints->valid_sw_formats[i] = supported_formats[i];
    constraints->valid_sw_formats[FF_ARRAY_ELEMS(supported_formats)] = AV_PIX_FMT_NONE;

    constraints->valid_hw_formats = av_malloc_array(2, sizeof(*constraints->valid_hw_formats));
    if (!constraints->valid_hw_formats)
        return AVERROR(ENOMEM);

    constraints->valid_hw_formats[0] = AV_PIX_FMT_CUDA;
    constraints->valid_hw_formats[1] = AV_PIX_FMT_NONE;

    return 0;
}
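
/*
 * Callers normally reach this table through the public API rather than
 * through cuda_frames_get_constraints() directly. A minimal sketch of that
 * path (the device string "0" and the NULL hwconfig are illustrative):
 *
 *     AVBufferRef *dev = NULL;
 *     if (av_hwdevice_ctx_create(&dev, AV_HWDEVICE_TYPE_CUDA, "0", NULL, 0) == 0) {
 *         AVHWFramesConstraints *c = av_hwdevice_get_hwframe_constraints(dev, NULL);
 *         // c->valid_sw_formats is the AV_PIX_FMT_NONE-terminated list built above
 *         av_hwframe_constraints_free(&c);
 *         av_buffer_unref(&dev);
 *     }
 */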

static void cuda_buffer_free(void *opaque, uint8_t *data)
{
    AVHWFramesContext        *ctx = opaque;
    AVHWDeviceContext *device_ctx = ctx->device_ctx;
    AVCUDADeviceContext    *hwctx = device_ctx->hwctx;
    CudaFunctions             *cu = hwctx->internal->cuda_dl;

    CUcontext dummy;

    CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));

    CHECK_CU(cu->cuMemFree((CUdeviceptr)data));

    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
}

static AVBufferRef *cuda_pool_alloc(void *opaque, size_t size)
{
    AVHWFramesContext        *ctx = opaque;
    AVHWDeviceContext *device_ctx = ctx->device_ctx;
    AVCUDADeviceContext    *hwctx = device_ctx->hwctx;
    CudaFunctions             *cu = hwctx->internal->cuda_dl;

    AVBufferRef *ret = NULL;
    CUcontext dummy = NULL;
    CUdeviceptr data;
    int err;

    err = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
    if (err < 0)
        return NULL;

    err = CHECK_CU(cu->cuMemAlloc(&data, size));
    if (err < 0)
        goto fail;

    ret = av_buffer_create((uint8_t*)data, size, cuda_buffer_free, ctx, 0);
    if (!ret)
        CHECK_CU(cu->cuMemFree(data));

fail:
    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
    return ret;
}
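
/*
 * Note the pattern shared by the allocator and the free callback above:
 * every driver-API call is bracketed by cuCtxPushCurrent()/cuCtxPopCurrent(),
 * so the frames context's CUDA context is bound only for the duration of the
 * call and whatever context the caller had current is restored afterwards.
 * New code in this file should follow the same shape:
 *
 *     CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
 *     // ... driver-API calls ...
 *     CHECK_CU(cu->cuCtxPopCurrent(&dummy));
 */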

static int cuda_frames_init(AVHWFramesContext *ctx)
{
    AVHWDeviceContext *device_ctx = ctx->device_ctx;
    AVCUDADeviceContext   *hwctx = device_ctx->hwctx;
    CUDAFramesContext      *priv = ctx->hwctx;
    CudaFunctions            *cu = hwctx->internal->cuda_dl;
    int err, i;

    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) {
        if (ctx->sw_format == supported_formats[i])
            break;
    }
    if (i == FF_ARRAY_ELEMS(supported_formats)) {
        av_log(ctx, AV_LOG_ERROR, "Pixel format '%s' is not supported\n",
               av_get_pix_fmt_name(ctx->sw_format));
        return AVERROR(ENOSYS);
    }

    err = CHECK_CU(cu->cuDeviceGetAttribute(&priv->tex_alignment,
                                            14 /* CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT */,
                                            hwctx->internal->cuda_device));
    if (err < 0)
        return err;

    av_log(ctx, AV_LOG_DEBUG, "CUDA texture alignment: %d\n", priv->tex_alignment);

    // YUV420P is a special case: since nvenc expects the U/V planes to have
    // half the linesize of the Y plane, the alignment has to be doubled to
    // ensure the half-pitch U/V planes still end up aligned.
    if (ctx->sw_format == AV_PIX_FMT_YUV420P)
        priv->tex_alignment *= 2;

    av_pix_fmt_get_chroma_sub_sample(ctx->sw_format, &priv->shift_width, &priv->shift_height);

    if (!ctx->pool) {
        int size = av_image_get_buffer_size(ctx->sw_format, ctx->width, ctx->height,
                                            priv->tex_alignment);
        if (size < 0)
            return size;

        ffhwframesctx(ctx)->pool_internal =
            av_buffer_pool_init2(size, ctx, cuda_pool_alloc, NULL);
        if (!ffhwframesctx(ctx)->pool_internal)
            return AVERROR(ENOMEM);
    }

    return 0;
}
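
/*
 * Sketch of how user code ends up in cuda_frames_init() and cuda_get_buffer()
 * below (the 1920x1080 NV12 values are illustrative):
 *
 *     AVBufferRef *frames_ref = av_hwframe_ctx_alloc(device_ref);
 *     AVHWFramesContext *fc   = (AVHWFramesContext*)frames_ref->data;
 *     fc->format    = AV_PIX_FMT_CUDA;
 *     fc->sw_format = AV_PIX_FMT_NV12;
 *     fc->width     = 1920;
 *     fc->height    = 1080;
 *     av_hwframe_ctx_init(frames_ref);              // -> cuda_frames_init()
 *     av_hwframe_get_buffer(frames_ref, frame, 0);  // -> cuda_get_buffer()
 */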

static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame)
{
    CUDAFramesContext *priv = ctx->hwctx;
    int res;

    frame->buf[0] = av_buffer_pool_get(ctx->pool);
    if (!frame->buf[0])
        return AVERROR(ENOMEM);

    res = av_image_fill_arrays(frame->data, frame->linesize, frame->buf[0]->data,
                               ctx->sw_format, ctx->width, ctx->height, priv->tex_alignment);
    if (res < 0)
        return res;

    // YUV420P is a special case: nvenc expects the U/V planes in the order
    // swapped from how FFmpeg lays them out, and chroma is half-aligned.
    if (ctx->sw_format == AV_PIX_FMT_YUV420P) {
        frame->linesize[1] = frame->linesize[2] = frame->linesize[0] / 2;
        frame->data[2]     = frame->data[1];
        frame->data[1]     = frame->data[2] + frame->linesize[2] * (ctx->height / 2);
    }

    frame->format = AV_PIX_FMT_CUDA;
    frame->width  = ctx->width;
    frame->height = ctx->height;

    return 0;
}
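
/*
 * After the YUV420P swap above, the planes sit in Y, V, U order inside the
 * single allocation (data[2] points just past the Y plane, data[1] just past
 * V), while the AVFrame fields keep their usual meaning: data[1] is still U
 * and data[2] is still V.
 */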

static int cuda_transfer_get_formats(AVHWFramesContext *ctx,
                                     enum AVHWFrameTransferDirection dir,
                                     enum AVPixelFormat **formats)
{
    enum AVPixelFormat *fmts;

    fmts = av_malloc_array(2, sizeof(*fmts));
    if (!fmts)
        return AVERROR(ENOMEM);

    fmts[0] = ctx->sw_format;
    fmts[1] = AV_PIX_FMT_NONE;

    *formats = fmts;

    return 0;
}
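
/*
 * Only the frames context's own sw_format is offered in either transfer
 * direction; any pixel-format conversion has to happen before the upload or
 * after the download (e.g. via swscale).
 */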

static int cuda_transfer_data(AVHWFramesContext *ctx, AVFrame *dst,
                              const AVFrame *src)
{
    CUDAFramesContext       *priv = ctx->hwctx;
    AVHWDeviceContext *device_ctx = ctx->device_ctx;
    AVCUDADeviceContext    *hwctx = device_ctx->hwctx;
    CudaFunctions             *cu = hwctx->internal->cuda_dl;

    CUcontext dummy;
    int i, ret;

    if ((src->hw_frames_ctx && ((AVHWFramesContext*)src->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA) ||
        (dst->hw_frames_ctx && ((AVHWFramesContext*)dst->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA))
        return AVERROR(ENOSYS);

    ret = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
    if (ret < 0)
        return ret;

    for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) {
        CUDA_MEMCPY2D cpy = {
            .srcPitch     = src->linesize[i],
            .dstPitch     = dst->linesize[i],
            .WidthInBytes = FFMIN(src->linesize[i], dst->linesize[i]),
            .Height       = src->height >> ((i == 0 || i == 3) ? 0 : priv->shift_height),
        };

        if (src->hw_frames_ctx) {
            cpy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
            cpy.srcDevice     = (CUdeviceptr)src->data[i];
        } else {
            cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
            cpy.srcHost       = src->data[i];
        }

        if (dst->hw_frames_ctx) {
            cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
            cpy.dstDevice     = (CUdeviceptr)dst->data[i];
        } else {
            cpy.dstMemoryType = CU_MEMORYTYPE_HOST;
            cpy.dstHost       = dst->data[i];
        }

        ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, hwctx->stream));
        if (ret < 0)
            goto exit;
    }

    // Downloads to system memory have to be synchronized before the caller
    // may touch the destination buffer.
    if (!dst->hw_frames_ctx) {
        ret = CHECK_CU(cu->cuStreamSynchronize(hwctx->stream));
        if (ret < 0)
            goto exit;
    }

exit:
    CHECK_CU(cu->cuCtxPopCurrent(&dummy));

    return ret;
}
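
/*
 * This one routine backs both transfer_data_to and transfer_data_from in the
 * HWContextType table below; the direction is inferred from which side
 * carries hw_frames_ctx. From user code:
 *
 *     av_hwframe_transfer_data(sw_frame, cuda_frame, 0);  // download
 *     av_hwframe_transfer_data(cuda_frame, sw_frame, 0);  // upload
 */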

static void cuda_device_uninit(AVHWDeviceContext *device_ctx)
{
    CUDADeviceContext *hwctx = device_ctx->hwctx;

    if (hwctx->p.internal) {
        CudaFunctions *cu = hwctx->internal.cuda_dl;

        if (hwctx->internal.is_allocated && hwctx->p.cuda_ctx) {
            if (hwctx->internal.flags & AV_CUDA_USE_PRIMARY_CONTEXT)
                CHECK_CU(cu->cuDevicePrimaryCtxRelease(hwctx->internal.cuda_device));
            else if (!(hwctx->internal.flags & AV_CUDA_USE_CURRENT_CONTEXT))
                CHECK_CU(cu->cuCtxDestroy(hwctx->p.cuda_ctx));

            hwctx->p.cuda_ctx = NULL;
        }

        cuda_free_functions(&hwctx->internal.cuda_dl);
        memset(&hwctx->internal, 0, sizeof(hwctx->internal));
        hwctx->p.internal = NULL;
    }
}

static int cuda_device_init(AVHWDeviceContext *ctx)
{
    CUDADeviceContext *hwctx = ctx->hwctx;
    int ret;

    hwctx->p.internal = &hwctx->internal;

    if (!hwctx->internal.cuda_dl) {
        ret = cuda_load_functions(&hwctx->internal.cuda_dl, ctx);
        if (ret < 0) {
            av_log(ctx, AV_LOG_ERROR, "Could not dynamically load CUDA\n");
            goto error;
        }
    }

    return 0;

error:
    cuda_device_uninit(ctx);
    return ret;
}
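
/*
 * cuda_load_functions() comes from the ffnvcodec dynlink loader, which
 * dlopens the driver library at runtime (libcuda.so.1 on Linux, nvcuda.dll
 * on Windows), so FFmpeg carries no link-time dependency on CUDA.
 */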

static int cuda_context_init(AVHWDeviceContext *device_ctx, int flags)
{
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
    CudaFunctions *cu;
    CUcontext dummy;

    int ret, dev_active = 0;
    unsigned int dev_flags = 0;

    const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC;

    cu = hwctx->internal->cuda_dl;

    hwctx->internal->flags = flags;

    if (flags & AV_CUDA_USE_PRIMARY_CONTEXT) {
        ret = CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device,
                                                      &dev_flags, &dev_active));
        if (ret < 0)
            return ret;

        if (dev_active && dev_flags != desired_flags) {
            av_log(device_ctx, AV_LOG_ERROR, "Primary context already active with incompatible flags.\n");
            return AVERROR(ENOTSUP);
        } else if (dev_flags != desired_flags) {
            ret = CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device,
                                                          desired_flags));
            if (ret < 0)
                return ret;
        }

        ret = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx,
                                                    hwctx->internal->cuda_device));
        if (ret < 0)
            return ret;
    } else if (flags & AV_CUDA_USE_CURRENT_CONTEXT) {
        ret = CHECK_CU(cu->cuCtxGetCurrent(&hwctx->cuda_ctx));
        if (ret < 0)
            return ret;
        av_log(device_ctx, AV_LOG_INFO, "Using current CUDA context.\n");
    } else {
        ret = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags,
                                       hwctx->internal->cuda_device));
        if (ret < 0)
            return ret;

        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
    }

    hwctx->internal->is_allocated = 1;

    // Setting stream to NULL will make functions automatically use the default CUstream.
    hwctx->stream = NULL;

    return 0;
}
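
/*
 * The three branches above give three ownership models: a retained primary
 * context (shared with other users of the device), a borrowed current
 * context (owned by the caller), or a context created and owned here. The
 * matching teardown paths live in cuda_device_uninit().
 */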

static int cuda_flags_from_opts(AVHWDeviceContext *device_ctx,
                                AVDictionary *opts, int *flags)
{
    AVDictionaryEntry *primary_ctx_opt = av_dict_get(opts, "primary_ctx", NULL, 0);
    AVDictionaryEntry *current_ctx_opt = av_dict_get(opts, "current_ctx", NULL, 0);

    int use_primary_ctx = 0, use_current_ctx = 0;
    if (primary_ctx_opt)
        use_primary_ctx = strtol(primary_ctx_opt->value, NULL, 10);

    if (current_ctx_opt)
        use_current_ctx = strtol(current_ctx_opt->value, NULL, 10);

    if (use_primary_ctx && use_current_ctx) {
        av_log(device_ctx, AV_LOG_ERROR, "Requested both primary and current CUDA context simultaneously.\n");
        return AVERROR(EINVAL);
    }

    if (primary_ctx_opt && use_primary_ctx) {
        av_log(device_ctx, AV_LOG_VERBOSE, "Using CUDA primary device context\n");
        *flags |= AV_CUDA_USE_PRIMARY_CONTEXT;
    } else if (primary_ctx_opt) {
        av_log(device_ctx, AV_LOG_VERBOSE, "Disabling use of CUDA primary device context\n");
        *flags &= ~AV_CUDA_USE_PRIMARY_CONTEXT;
    }

    if (current_ctx_opt && use_current_ctx) {
        av_log(device_ctx, AV_LOG_VERBOSE, "Using CUDA current device context\n");
        *flags |= AV_CUDA_USE_CURRENT_CONTEXT;
    } else if (current_ctx_opt) {
        av_log(device_ctx, AV_LOG_VERBOSE, "Disabling use of CUDA current device context\n");
        *flags &= ~AV_CUDA_USE_CURRENT_CONTEXT;
    }

    return 0;
}
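
/*
 * The options parsed above arrive through av_hwdevice_ctx_create(); the
 * snippet below is roughly equivalent to
 * "-init_hw_device cuda:0,primary_ctx=1" on the ffmpeg command line:
 *
 *     AVDictionary *opts = NULL;
 *     AVBufferRef *dev = NULL;
 *     av_dict_set(&opts, "primary_ctx", "1", 0);
 *     av_hwdevice_ctx_create(&dev, AV_HWDEVICE_TYPE_CUDA, "0", opts, 0);
 *     av_dict_free(&opts);
 */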

static int cuda_device_create(AVHWDeviceContext *device_ctx,
                              const char *device,
                              AVDictionary *opts, int flags)
{
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
    CudaFunctions *cu;
    int ret, device_idx = 0;

    ret = cuda_flags_from_opts(device_ctx, opts, &flags);
    if (ret < 0)
        goto error;

    if (device)
        device_idx = strtol(device, NULL, 0);

    ret = cuda_device_init(device_ctx);
    if (ret < 0)
        goto error;

    cu = hwctx->internal->cuda_dl;

    ret = CHECK_CU(cu->cuInit(0));
    if (ret < 0)
        goto error;

    ret = CHECK_CU(cu->cuDeviceGet(&hwctx->internal->cuda_device, device_idx));
    if (ret < 0)
        goto error;

    ret = cuda_context_init(device_ctx, flags);
    if (ret < 0)
        goto error;

    return 0;

error:
    cuda_device_uninit(device_ctx);
    return ret;
}

static int cuda_device_derive(AVHWDeviceContext *device_ctx,
                              AVHWDeviceContext *src_ctx, AVDictionary *opts,
                              int flags)
{
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
    CudaFunctions *cu;
    const char *src_uuid = NULL;
#if CONFIG_VULKAN
    VkPhysicalDeviceIDProperties vk_idp;
#endif
    int ret, i, device_count;

    ret = cuda_flags_from_opts(device_ctx, opts, &flags);
    if (ret < 0)
        goto error;

#if CONFIG_VULKAN
    vk_idp = (VkPhysicalDeviceIDProperties) {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES,
    };
#endif

    switch (src_ctx->type) {
#if CONFIG_VULKAN
#define TYPE PFN_vkGetPhysicalDeviceProperties2
    case AV_HWDEVICE_TYPE_VULKAN: {
        AVVulkanDeviceContext *vkctx = src_ctx->hwctx;
        TYPE prop_fn = (TYPE)vkctx->get_proc_addr(vkctx->inst, "vkGetPhysicalDeviceProperties2");
        VkPhysicalDeviceProperties2 vk_dev_props = {
            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
            .pNext = &vk_idp,
        };
        prop_fn(vkctx->phys_dev, &vk_dev_props);
        src_uuid = vk_idp.deviceUUID;
        break;
    }
#undef TYPE
#endif
    default:
        ret = AVERROR(ENOSYS);
        goto error;
    }

    if (!src_uuid) {
        av_log(device_ctx, AV_LOG_ERROR,
               "Failed to get UUID of source device.\n");
        ret = AVERROR(EINVAL);
        goto error;
    }

    ret = cuda_device_init(device_ctx);
    if (ret < 0)
        goto error;

    cu = hwctx->internal->cuda_dl;

    ret = CHECK_CU(cu->cuInit(0));
    if (ret < 0)
        goto error;

    ret = CHECK_CU(cu->cuDeviceGetCount(&device_count));
    if (ret < 0)
        goto error;

    hwctx->internal->cuda_device = -1;
    for (i = 0; i < device_count; i++) {
        CUdevice dev;
        CUuuid uuid;

        ret = CHECK_CU(cu->cuDeviceGet(&dev, i));
        if (ret < 0)
            goto error;

        ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev));
        if (ret < 0)
            goto error;

        if (memcmp(src_uuid, uuid.bytes, sizeof(uuid.bytes)) == 0) {
            hwctx->internal->cuda_device = dev;
            break;
        }
    }

    if (hwctx->internal->cuda_device == -1) {
        av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA device.\n");
        ret = AVERROR(ENODEV);
        goto error;
    }

    ret = cuda_context_init(device_ctx, flags);
    if (ret < 0)
        goto error;

    return 0;

error:
    cuda_device_uninit(device_ctx);
    return ret;
}
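
/*
 * The UUID matching above backs av_hwdevice_ctx_create_derived(), e.g.
 * deriving a CUDA device that refers to the same GPU as an existing Vulkan
 * device:
 *
 *     AVBufferRef *cuda_dev = NULL;
 *     av_hwdevice_ctx_create_derived(&cuda_dev, AV_HWDEVICE_TYPE_CUDA,
 *                                    vulkan_dev, 0);
 */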

const HWContextType ff_hwcontext_type_cuda = {
    .type                   = AV_HWDEVICE_TYPE_CUDA,
    .name                   = "CUDA",

    .device_hwctx_size      = sizeof(CUDADeviceContext),
    .frames_hwctx_size      = sizeof(CUDAFramesContext),

    .device_create          = cuda_device_create,
    .device_derive          = cuda_device_derive,
    .device_init            = cuda_device_init,
    .device_uninit          = cuda_device_uninit,
    .frames_get_constraints = cuda_frames_get_constraints,
    .frames_init            = cuda_frames_init,
    .frames_get_buffer      = cuda_get_buffer,
    .transfer_get_formats   = cuda_transfer_get_formats,
    .transfer_data_to       = cuda_transfer_data,
    .transfer_data_from     = cuda_transfer_data,

    .pix_fmts               = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE },
};