| /* |
| * This file is part of FFmpeg. |
| * |
| * FFmpeg is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * FFmpeg is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with FFmpeg; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| */ |
| |
| #include "buffer.h" |
| #include "common.h" |
| #include "hwcontext.h" |
| #include "hwcontext_internal.h" |
| #include "hwcontext_cuda_internal.h" |
| #if CONFIG_VULKAN |
| #include "hwcontext_vulkan.h" |
| #endif |
| #include "cuda_check.h" |
| #include "mem.h" |
| #include "pixdesc.h" |
| #include "pixfmt.h" |
| #include "imgutils.h" |
| |
| #define CUDA_FRAME_ALIGNMENT 256 |
| |
| typedef struct CUDAFramesContext { |
| int shift_width, shift_height; |
| } CUDAFramesContext; |
| |
| static const enum AVPixelFormat supported_formats[] = { |
| AV_PIX_FMT_NV12, |
| AV_PIX_FMT_YUV420P, |
| AV_PIX_FMT_YUVA420P, |
| AV_PIX_FMT_YUV444P, |
| AV_PIX_FMT_P010, |
| AV_PIX_FMT_P016, |
| AV_PIX_FMT_YUV444P16, |
| AV_PIX_FMT_0RGB32, |
| AV_PIX_FMT_0BGR32, |
| #if CONFIG_VULKAN |
| AV_PIX_FMT_VULKAN, |
| #endif |
| }; |
| |
| #define CHECK_CU(x) FF_CUDA_CHECK_DL(device_ctx, cu, x) |
| |
| static int cuda_frames_get_constraints(AVHWDeviceContext *ctx, |
| const void *hwconfig, |
| AVHWFramesConstraints *constraints) |
| { |
| int i; |
| |
| constraints->valid_sw_formats = av_malloc_array(FF_ARRAY_ELEMS(supported_formats) + 1, |
| sizeof(*constraints->valid_sw_formats)); |
| if (!constraints->valid_sw_formats) |
| return AVERROR(ENOMEM); |
| |
| for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) |
| constraints->valid_sw_formats[i] = supported_formats[i]; |
| constraints->valid_sw_formats[FF_ARRAY_ELEMS(supported_formats)] = AV_PIX_FMT_NONE; |
| |
| constraints->valid_hw_formats = av_malloc_array(2, sizeof(*constraints->valid_hw_formats)); |
| if (!constraints->valid_hw_formats) |
| return AVERROR(ENOMEM); |
| |
| constraints->valid_hw_formats[0] = AV_PIX_FMT_CUDA; |
| constraints->valid_hw_formats[1] = AV_PIX_FMT_NONE; |
| |
| return 0; |
| } |
| |
| static void cuda_buffer_free(void *opaque, uint8_t *data) |
| { |
| AVHWFramesContext *ctx = opaque; |
| AVHWDeviceContext *device_ctx = ctx->device_ctx; |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; |
| CudaFunctions *cu = hwctx->internal->cuda_dl; |
| |
| CUcontext dummy; |
| |
| CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx)); |
| |
| CHECK_CU(cu->cuMemFree((CUdeviceptr)data)); |
| |
| CHECK_CU(cu->cuCtxPopCurrent(&dummy)); |
| } |
| |
| static AVBufferRef *cuda_pool_alloc(void *opaque, int size) |
| { |
| AVHWFramesContext *ctx = opaque; |
| AVHWDeviceContext *device_ctx = ctx->device_ctx; |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; |
| CudaFunctions *cu = hwctx->internal->cuda_dl; |
| |
| AVBufferRef *ret = NULL; |
| CUcontext dummy = NULL; |
| CUdeviceptr data; |
| int err; |
| |
| err = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx)); |
| if (err < 0) |
| return NULL; |
| |
| err = CHECK_CU(cu->cuMemAlloc(&data, size)); |
| if (err < 0) |
| goto fail; |
| |
| ret = av_buffer_create((uint8_t*)data, size, cuda_buffer_free, ctx, 0); |
| if (!ret) { |
| CHECK_CU(cu->cuMemFree(data)); |
| goto fail; |
| } |
| |
| fail: |
| CHECK_CU(cu->cuCtxPopCurrent(&dummy)); |
| return ret; |
| } |
| |
| static int cuda_frames_init(AVHWFramesContext *ctx) |
| { |
| CUDAFramesContext *priv = ctx->internal->priv; |
| int i; |
| |
| for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) { |
| if (ctx->sw_format == supported_formats[i]) |
| break; |
| } |
| if (i == FF_ARRAY_ELEMS(supported_formats)) { |
| av_log(ctx, AV_LOG_ERROR, "Pixel format '%s' is not supported\n", |
| av_get_pix_fmt_name(ctx->sw_format)); |
| return AVERROR(ENOSYS); |
| } |
| |
| av_pix_fmt_get_chroma_sub_sample(ctx->sw_format, &priv->shift_width, &priv->shift_height); |
| |
| if (!ctx->pool) { |
| int size = av_image_get_buffer_size(ctx->sw_format, ctx->width, ctx->height, CUDA_FRAME_ALIGNMENT); |
| if (size < 0) |
| return size; |
| |
| ctx->internal->pool_internal = av_buffer_pool_init2(size, ctx, cuda_pool_alloc, NULL); |
| if (!ctx->internal->pool_internal) |
| return AVERROR(ENOMEM); |
| } |
| |
| return 0; |
| } |
| |
| static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame) |
| { |
| int res; |
| |
| frame->buf[0] = av_buffer_pool_get(ctx->pool); |
| if (!frame->buf[0]) |
| return AVERROR(ENOMEM); |
| |
| res = av_image_fill_arrays(frame->data, frame->linesize, frame->buf[0]->data, |
| ctx->sw_format, ctx->width, ctx->height, CUDA_FRAME_ALIGNMENT); |
| if (res < 0) |
| return res; |
| |
| // YUV420P is a special case. |
| // Nvenc expects the U/V planes in swapped order from how ffmpeg expects them, also chroma is half-aligned |
| if (ctx->sw_format == AV_PIX_FMT_YUV420P) { |
| frame->linesize[1] = frame->linesize[2] = frame->linesize[0] / 2; |
| frame->data[2] = frame->data[1]; |
| frame->data[1] = frame->data[2] + frame->linesize[2] * ctx->height / 2; |
| } |
| |
| frame->format = AV_PIX_FMT_CUDA; |
| frame->width = ctx->width; |
| frame->height = ctx->height; |
| |
| return 0; |
| } |
| |
| static int cuda_transfer_get_formats(AVHWFramesContext *ctx, |
| enum AVHWFrameTransferDirection dir, |
| enum AVPixelFormat **formats) |
| { |
| enum AVPixelFormat *fmts; |
| |
| fmts = av_malloc_array(2, sizeof(*fmts)); |
| if (!fmts) |
| return AVERROR(ENOMEM); |
| |
| fmts[0] = ctx->sw_format; |
| fmts[1] = AV_PIX_FMT_NONE; |
| |
| *formats = fmts; |
| |
| return 0; |
| } |
| |
| static int cuda_transfer_data(AVHWFramesContext *ctx, AVFrame *dst, |
| const AVFrame *src) |
| { |
| CUDAFramesContext *priv = ctx->internal->priv; |
| AVHWDeviceContext *device_ctx = ctx->device_ctx; |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; |
| CudaFunctions *cu = hwctx->internal->cuda_dl; |
| |
| CUcontext dummy; |
| int i, ret; |
| |
| if ((src->hw_frames_ctx && ((AVHWFramesContext*)src->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA) || |
| (dst->hw_frames_ctx && ((AVHWFramesContext*)dst->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA)) |
| return AVERROR(ENOSYS); |
| |
| ret = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx)); |
| if (ret < 0) |
| return ret; |
| |
| for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) { |
| CUDA_MEMCPY2D cpy = { |
| .srcPitch = src->linesize[i], |
| .dstPitch = dst->linesize[i], |
| .WidthInBytes = FFMIN(src->linesize[i], dst->linesize[i]), |
| .Height = src->height >> ((i == 0 || i == 3) ? 0 : priv->shift_height), |
| }; |
| |
| if (src->hw_frames_ctx) { |
| cpy.srcMemoryType = CU_MEMORYTYPE_DEVICE; |
| cpy.srcDevice = (CUdeviceptr)src->data[i]; |
| } else { |
| cpy.srcMemoryType = CU_MEMORYTYPE_HOST; |
| cpy.srcHost = src->data[i]; |
| } |
| |
| if (dst->hw_frames_ctx) { |
| cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE; |
| cpy.dstDevice = (CUdeviceptr)dst->data[i]; |
| } else { |
| cpy.dstMemoryType = CU_MEMORYTYPE_HOST; |
| cpy.dstHost = dst->data[i]; |
| } |
| |
| ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, hwctx->stream)); |
| if (ret < 0) |
| goto exit; |
| } |
| |
| if (!dst->hw_frames_ctx) { |
| ret = CHECK_CU(cu->cuStreamSynchronize(hwctx->stream)); |
| if (ret < 0) |
| goto exit; |
| } |
| |
| exit: |
| CHECK_CU(cu->cuCtxPopCurrent(&dummy)); |
| |
| return 0; |
| } |
| |
| static void cuda_device_uninit(AVHWDeviceContext *device_ctx) |
| { |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; |
| |
| if (hwctx->internal) { |
| CudaFunctions *cu = hwctx->internal->cuda_dl; |
| |
| if (hwctx->internal->is_allocated && hwctx->cuda_ctx) { |
| if (hwctx->internal->flags & AV_CUDA_USE_PRIMARY_CONTEXT) |
| CHECK_CU(cu->cuDevicePrimaryCtxRelease(hwctx->internal->cuda_device)); |
| else |
| CHECK_CU(cu->cuCtxDestroy(hwctx->cuda_ctx)); |
| |
| hwctx->cuda_ctx = NULL; |
| } |
| |
| cuda_free_functions(&hwctx->internal->cuda_dl); |
| } |
| |
| av_freep(&hwctx->internal); |
| } |
| |
| static int cuda_device_init(AVHWDeviceContext *ctx) |
| { |
| AVCUDADeviceContext *hwctx = ctx->hwctx; |
| int ret; |
| |
| if (!hwctx->internal) { |
| hwctx->internal = av_mallocz(sizeof(*hwctx->internal)); |
| if (!hwctx->internal) |
| return AVERROR(ENOMEM); |
| } |
| |
| if (!hwctx->internal->cuda_dl) { |
| ret = cuda_load_functions(&hwctx->internal->cuda_dl, ctx); |
| if (ret < 0) { |
| av_log(ctx, AV_LOG_ERROR, "Could not dynamically load CUDA\n"); |
| goto error; |
| } |
| } |
| |
| return 0; |
| |
| error: |
| cuda_device_uninit(ctx); |
| return ret; |
| } |
| |
| static int cuda_context_init(AVHWDeviceContext *device_ctx, int flags) { |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; |
| CudaFunctions *cu; |
| CUcontext dummy; |
| int ret, dev_active = 0; |
| unsigned int dev_flags = 0; |
| |
| const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC; |
| |
| cu = hwctx->internal->cuda_dl; |
| |
| hwctx->internal->flags = flags; |
| |
| if (flags & AV_CUDA_USE_PRIMARY_CONTEXT) { |
| ret = CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device, |
| &dev_flags, &dev_active)); |
| if (ret < 0) |
| return ret; |
| |
| if (dev_active && dev_flags != desired_flags) { |
| av_log(device_ctx, AV_LOG_ERROR, "Primary context already active with incompatible flags.\n"); |
| return AVERROR(ENOTSUP); |
| } else if (dev_flags != desired_flags) { |
| ret = CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device, |
| desired_flags)); |
| if (ret < 0) |
| return ret; |
| } |
| |
| ret = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx, |
| hwctx->internal->cuda_device)); |
| if (ret < 0) |
| return ret; |
| } else { |
| ret = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags, |
| hwctx->internal->cuda_device)); |
| if (ret < 0) |
| return ret; |
| |
| CHECK_CU(cu->cuCtxPopCurrent(&dummy)); |
| } |
| |
| hwctx->internal->is_allocated = 1; |
| |
| // Setting stream to NULL will make functions automatically use the default CUstream |
| hwctx->stream = NULL; |
| |
| return 0; |
| } |
| |
| static int cuda_device_create(AVHWDeviceContext *device_ctx, |
| const char *device, |
| AVDictionary *opts, int flags) |
| { |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; |
| CudaFunctions *cu; |
| int ret, device_idx = 0; |
| |
| if (device) |
| device_idx = strtol(device, NULL, 0); |
| |
| if (cuda_device_init(device_ctx) < 0) |
| goto error; |
| |
| cu = hwctx->internal->cuda_dl; |
| |
| ret = CHECK_CU(cu->cuInit(0)); |
| if (ret < 0) |
| goto error; |
| |
| ret = CHECK_CU(cu->cuDeviceGet(&hwctx->internal->cuda_device, device_idx)); |
| if (ret < 0) |
| goto error; |
| |
| ret = cuda_context_init(device_ctx, flags); |
| if (ret < 0) |
| goto error; |
| |
| return 0; |
| |
| error: |
| cuda_device_uninit(device_ctx); |
| return AVERROR_UNKNOWN; |
| } |
| |
| static int cuda_device_derive(AVHWDeviceContext *device_ctx, |
| AVHWDeviceContext *src_ctx, |
| int flags) { |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; |
| CudaFunctions *cu; |
| const char *src_uuid = NULL; |
| int ret, i, device_count; |
| |
| #if CONFIG_VULKAN |
| VkPhysicalDeviceIDProperties vk_idp = { |
| .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES, |
| }; |
| #endif |
| |
| switch (src_ctx->type) { |
| #if CONFIG_VULKAN |
| case AV_HWDEVICE_TYPE_VULKAN: { |
| AVVulkanDeviceContext *vkctx = src_ctx->hwctx; |
| VkPhysicalDeviceProperties2 vk_dev_props = { |
| .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2, |
| .pNext = &vk_idp, |
| }; |
| vkGetPhysicalDeviceProperties2(vkctx->phys_dev, &vk_dev_props); |
| src_uuid = vk_idp.deviceUUID; |
| break; |
| } |
| #endif |
| default: |
| return AVERROR(ENOSYS); |
| } |
| |
| if (!src_uuid) { |
| av_log(device_ctx, AV_LOG_ERROR, |
| "Failed to get UUID of source device.\n"); |
| goto error; |
| } |
| |
| if (cuda_device_init(device_ctx) < 0) |
| goto error; |
| |
| cu = hwctx->internal->cuda_dl; |
| |
| ret = CHECK_CU(cu->cuInit(0)); |
| if (ret < 0) |
| goto error; |
| |
| ret = CHECK_CU(cu->cuDeviceGetCount(&device_count)); |
| if (ret < 0) |
| goto error; |
| |
| hwctx->internal->cuda_device = -1; |
| for (i = 0; i < device_count; i++) { |
| CUdevice dev; |
| CUuuid uuid; |
| |
| ret = CHECK_CU(cu->cuDeviceGet(&dev, i)); |
| if (ret < 0) |
| goto error; |
| |
| ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev)); |
| if (ret < 0) |
| goto error; |
| |
| if (memcmp(src_uuid, uuid.bytes, sizeof (uuid.bytes)) == 0) { |
| hwctx->internal->cuda_device = dev; |
| break; |
| } |
| } |
| |
| if (hwctx->internal->cuda_device == -1) { |
| av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA device.\n"); |
| goto error; |
| } |
| |
| ret = cuda_context_init(device_ctx, flags); |
| if (ret < 0) |
| goto error; |
| |
| return 0; |
| |
| error: |
| cuda_device_uninit(device_ctx); |
| return AVERROR_UNKNOWN; |
| } |
| |
| const HWContextType ff_hwcontext_type_cuda = { |
| .type = AV_HWDEVICE_TYPE_CUDA, |
| .name = "CUDA", |
| |
| .device_hwctx_size = sizeof(AVCUDADeviceContext), |
| .frames_priv_size = sizeof(CUDAFramesContext), |
| |
| .device_create = cuda_device_create, |
| .device_derive = cuda_device_derive, |
| .device_init = cuda_device_init, |
| .device_uninit = cuda_device_uninit, |
| .frames_get_constraints = cuda_frames_get_constraints, |
| .frames_init = cuda_frames_init, |
| .frames_get_buffer = cuda_get_buffer, |
| .transfer_get_formats = cuda_transfer_get_formats, |
| .transfer_data_to = cuda_transfer_data, |
| .transfer_data_from = cuda_transfer_data, |
| |
| .pix_fmts = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE }, |
| }; |