0
0
mirror of https://github.com/obsproject/obs-studio.git synced 2024-09-19 20:32:15 +02:00
obs-studio/plugins/obs-nvenc/nvenc-cuda.c
2024-08-08 22:52:40 +02:00

346 lines
9.1 KiB
C

#include "nvenc-internal.h"
#include "nvenc-helpers.h"
/*
* NVENC implementation using CUDA context and arrays
*/
/* ------------------------------------------------------------------------- */
/* CUDA Context management */
/**
 * Picks a CUDA device and creates the CUDA context stored in enc->cu_ctx.
 *
 * @param enc       Encoder state; receives the new context in enc->cu_ctx.
 * @param settings  Encoder settings; "device" (and, outside Windows,
 *                  "force_cuda_tex") are read from it.
 * @param texture   Whether the texture-based encoder is being set up.
 * @return true on success (or immediately for texture mode on Windows),
 *         false if no usable CUDA device/context could be obtained.
 *
 * CU_FAILED is defined outside this file; it is used here as a statement
 * that is expected to bail out with `false` when the wrapped call fails.
 */
bool cuda_ctx_init(struct nvenc_data *enc, obs_data_t *settings,
                   const bool texture)
{
#ifdef _WIN32
    /* Texture mode on Windows does not create a CUDA context here. */
    if (texture)
        return true;
#endif

    int count;
    CUdevice device;
    int gpu = (int)obs_data_get_int(settings, "device");

    /* A configured index of -1 is replaced with device 0. */
    if (gpu == -1)
        gpu = 0;

#ifndef _WIN32
    /* Hidden, experimental option: CUDA can map OpenGL resources across
     * GPUs fairly efficiently, so allow forcing the configured device
     * even in texture mode. */
    bool force_cuda_tex = obs_data_get_bool(settings, "force_cuda_tex");
#endif

    CU_FAILED(cu->cuInit(0))
    CU_FAILED(cu->cuDeviceGetCount(&count))
    if (!count) {
        NV_FAIL("No CUDA devices found");
        return false;
    }

#ifdef _WIN32
    CU_FAILED(cu->cuDeviceGet(&device, gpu))
#else
    if (texture && !force_cuda_tex) {
        /* Ask CUDA which device(s) back the current OpenGL context;
         * the query is made while inside the OBS graphics context. */
        unsigned int gl_dev_count = 0;
        CUdevice gl_devices[2];
        CUresult gl_res;

        obs_enter_graphics();
        gl_res = cu->cuGLGetDevices(&gl_dev_count, gl_devices, 2,
                                    CU_GL_DEVICE_LIST_ALL);
        obs_leave_graphics();

        const bool have_gl_device =
            gl_res == CUDA_SUCCESS && gl_dev_count != 0;

        if (!have_gl_device) {
            const char *name, *desc;

            if (gl_res == CUDA_ERROR_INVALID_GRAPHICS_CONTEXT) {
                /* Probably running on an iGPU; the caller should
                 * fall back to the non-texture encoder. */
                info("Not running on NVIDIA GPU, falling back "
                     "to non-texture encoder");
            } else if (cuda_get_error_desc(gl_res, &name, &desc)) {
                error("Failed to get a CUDA device for "
                      "the current OpenGL context: "
                      "%s: %s",
                      name, desc);
            } else {
                error("Failed to get a CUDA device for "
                      "the current OpenGL context: %d",
                      gl_res);
            }
            return false;
        }

        /* Per the documentation this should only happen with SLI,
         * i.e. never for OBS. */
        if (gl_dev_count > 1) {
            warn("Got more than one CUDA devices for OpenGL context,"
                 " this is untested.");
        }

        device = gl_devices[0];
        debug("Loading up CUDA on device %u", device);
    } else {
        /* Non-texture mode, or the override is set: use the
         * configured device index directly. */
        CU_FAILED(cu->cuDeviceGet(&device, gpu))
    }
#endif

    /* Create the context, then pop it again right away so it is not left
     * on this thread's context stack. */
    CU_FAILED(cu->cuCtxCreate(&enc->cu_ctx, 0, device))
    CU_FAILED(cu->cuCtxPopCurrent(NULL))
    return true;
}
/**
 * Destroys the CUDA context held in enc->cu_ctx, if any.
 *
 * Safe to call when no context was created, and safe to call more than
 * once: the handle is cleared after destruction.
 *
 * @param enc  Encoder state owning the context.
 */
void cuda_ctx_free(struct nvenc_data *enc)
{
    if (!enc->cu_ctx)
        return;

    /* Best-effort pop of whatever context is current on this thread
     * before destroying ours; the result is intentionally ignored. */
    cu->cuCtxPopCurrent(NULL);
    cu->cuCtxDestroy(enc->cu_ctx);

    /* Clear the handle so later checks of enc->cu_ctx (e.g. in
     * cuda_free_surfaces) do not operate on a destroyed context. */
    enc->cu_ctx = NULL;
}
/* ------------------------------------------------------------------------- */
/* CUDA Surface management */
static bool cuda_surface_init(struct nvenc_data *enc,
struct nv_cuda_surface *nvsurf)
{
const bool p010 = obs_p010_tex_active();
CUDA_ARRAY3D_DESCRIPTOR desc;
desc.Width = enc->cx;
desc.Height = enc->cy;
desc.Depth = 0;
desc.Flags = CUDA_ARRAY3D_SURFACE_LDST;
desc.NumChannels = 1;
if (!enc->non_texture) {
desc.Format = p010 ? CU_AD_FORMAT_UNSIGNED_INT16
: CU_AD_FORMAT_UNSIGNED_INT8;
desc.Height = enc->cy + enc->cy / 2;
} else {
switch (enc->surface_format) {
case NV_ENC_BUFFER_FORMAT_NV12:
desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
// Additional half-height plane for UV data
desc.Height += enc->cy / 2;
break;
case NV_ENC_BUFFER_FORMAT_YUV420_10BIT:
desc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
desc.Height += enc->cy / 2;
desc.NumChannels = 2; // number of bytes per element
break;
case NV_ENC_BUFFER_FORMAT_YUV444:
desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
desc.Height *= 3; // 3 full-size planes
break;
default:
error("Unknown input format: %d", enc->surface_format);
return false;
}
}
CU_FAILED(cu->cuArray3DCreate(&nvsurf->tex, &desc))
NV_ENC_REGISTER_RESOURCE res = {0};
res.version = NV_ENC_REGISTER_RESOURCE_VER;
res.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDAARRAY;
res.resourceToRegister = (void *)nvsurf->tex;
res.width = enc->cx;
res.height = enc->cy;
res.pitch = (uint32_t)(desc.Width * desc.NumChannels);
if (!enc->non_texture) {
res.bufferFormat = p010 ? NV_ENC_BUFFER_FORMAT_YUV420_10BIT
: NV_ENC_BUFFER_FORMAT_NV12;
} else {
res.bufferFormat = enc->surface_format;
}
if (NV_FAILED(nv.nvEncRegisterResource(enc->session, &res))) {
return false;
}
nvsurf->res = res.registeredResource;
nvsurf->mapped_res = NULL;
return true;
}
/**
 * Selects the NVENC surface format from enc->in_format and creates
 * enc->buf_count CUDA input surfaces, appending each to enc->surfaces.
 *
 * @param enc  Encoder state; enc->cu_ctx must already exist.
 * @return true if every surface was created. On failure false is returned;
 *         surfaces created before the failure remain in enc->surfaces (to
 *         be released via cuda_free_surfaces) and the CUDA context is no
 *         longer left pushed on the calling thread.
 */
bool cuda_init_surfaces(struct nvenc_data *enc)
{
    switch (enc->in_format) {
    case VIDEO_FORMAT_P010:
        enc->surface_format = NV_ENC_BUFFER_FORMAT_YUV420_10BIT;
        break;
    case VIDEO_FORMAT_I444:
        enc->surface_format = NV_ENC_BUFFER_FORMAT_YUV444;
        break;
    default:
        enc->surface_format = NV_ENC_BUFFER_FORMAT_NV12;
        break;
    }

    da_reserve(enc->surfaces, enc->buf_count);

    CU_FAILED(cu->cuCtxPushCurrent(enc->cu_ctx))

    for (uint32_t i = 0; i < enc->buf_count; i++) {
        struct nv_cuda_surface buf = {0};

        if (!cuda_surface_init(enc, &buf)) {
            /* Balance the push above before bailing out, so the
             * context does not stay current on this thread. The
             * result is ignored: we are already failing. */
            cu->cuCtxPopCurrent(NULL);
            return false;
        }

        da_push_back(enc->surfaces, &buf);
    }

    CU_FAILED(cu->cuCtxPopCurrent(NULL))
    return true;
}
/**
 * Releases one CUDA input surface: unmaps it if it is still mapped,
 * unregisters it from the NVENC session and destroys its CUDA array.
 *
 * A surface without a registered resource is left untouched.
 *
 * @param enc     Encoder state providing the NVENC session.
 * @param nvsurf  Surface to release.
 */
static void cuda_surface_free(struct nvenc_data *enc,
                              struct nv_cuda_surface *nvsurf)
{
    /* Nothing registered: nothing to tear down. */
    if (!nvsurf->res)
        return;

    if (nvsurf->mapped_res)
        nv.nvEncUnmapInputResource(enc->session, nvsurf->mapped_res);

    nv.nvEncUnregisterResource(enc->session, nvsurf->res);
    cu->cuArrayDestroy(nvsurf->tex);
}
/**
 * Releases every surface in enc->surfaces with the encoder's CUDA context
 * made current for the duration. Does nothing when no context exists.
 *
 * @param enc  Encoder state owning the surfaces and the context.
 */
void cuda_free_surfaces(struct nvenc_data *enc)
{
    if (enc->cu_ctx) {
        size_t idx = 0;

        cu->cuCtxPushCurrent(enc->cu_ctx);

        while (idx < enc->surfaces.num) {
            cuda_surface_free(enc, &enc->surfaces.array[idx]);
            idx++;
        }

        cu->cuCtxPopCurrent(NULL);
    }
}
/* ------------------------------------------------------------------------- */
/* Actual encoding stuff */
static inline bool copy_frame(struct nvenc_data *enc,
struct encoder_frame *frame,
struct nv_cuda_surface *surf)
{
bool success = true;
size_t height = enc->cy;
size_t width = enc->cx;
CUDA_MEMCPY2D m = {0};
m.srcMemoryType = CU_MEMORYTYPE_HOST;
m.dstMemoryType = CU_MEMORYTYPE_ARRAY;
m.dstArray = surf->tex;
m.WidthInBytes = width;
m.Height = height;
CU_FAILED(cu->cuCtxPushCurrent(enc->cu_ctx))
if (enc->surface_format == NV_ENC_BUFFER_FORMAT_NV12) {
/* Page-locks the host memory so that it can be DMAd directly
* rather than CUDA doing an internal copy to page-locked
* memory before actually DMA-ing to the GPU. */
CU_CHECK(cu->cuMemHostRegister(frame->data[0],
frame->linesize[0] * height, 0))
CU_CHECK(cu->cuMemHostRegister(
frame->data[1], frame->linesize[1] * height / 2, 0))
m.srcPitch = frame->linesize[0];
m.srcHost = frame->data[0];
CU_FAILED(cu->cuMemcpy2D(&m))
m.srcPitch = frame->linesize[1];
m.srcHost = frame->data[1];
m.dstY += height;
m.Height /= 2;
CU_FAILED(cu->cuMemcpy2D(&m))
} else if (enc->surface_format == NV_ENC_BUFFER_FORMAT_YUV420_10BIT) {
CU_CHECK(cu->cuMemHostRegister(frame->data[0],
frame->linesize[0] * height, 0))
CU_CHECK(cu->cuMemHostRegister(
frame->data[1], frame->linesize[1] * height / 2, 0))
// P010 lines are double the size (16 bit per pixel)
m.WidthInBytes *= 2;
m.srcPitch = frame->linesize[0];
m.srcHost = frame->data[0];
CU_FAILED(cu->cuMemcpy2D(&m))
m.srcPitch = frame->linesize[1];
m.srcHost = frame->data[1];
m.dstY += height;
m.Height /= 2;
CU_FAILED(cu->cuMemcpy2D(&m))
} else { // I444
CU_CHECK(cu->cuMemHostRegister(frame->data[0],
frame->linesize[0] * height, 0))
CU_CHECK(cu->cuMemHostRegister(frame->data[1],
frame->linesize[1] * height, 0))
CU_CHECK(cu->cuMemHostRegister(frame->data[2],
frame->linesize[2] * height, 0))
m.srcPitch = frame->linesize[0];
m.srcHost = frame->data[0];
CU_FAILED(cu->cuMemcpy2D(&m))
m.srcPitch = frame->linesize[1];
m.srcHost = frame->data[1];
m.dstY += height;
CU_FAILED(cu->cuMemcpy2D(&m))
m.srcPitch = frame->linesize[2];
m.srcHost = frame->data[2];
m.dstY += height;
CU_FAILED(cu->cuMemcpy2D(&m))
}
unmap:
if (frame->data[0])
cu->cuMemHostUnregister(frame->data[0]);
if (frame->data[1])
cu->cuMemHostUnregister(frame->data[1]);
if (frame->data[2])
cu->cuMemHostUnregister(frame->data[2]);
CU_FAILED(cu->cuCtxPopCurrent(NULL))
return success;
}
/**
 * Encodes one raw frame through the CUDA path: uploads it to the CUDA
 * surface for the current slot, maps that surface as an NVENC input and
 * hands it to nvenc_encode_base.
 *
 * @param data             Opaque pointer to the struct nvenc_data.
 * @param frame            Raw frame to encode.
 * @param packet           Output packet, passed on to nvenc_encode_base.
 * @param received_packet  Passed on to nvenc_encode_base.
 * @return false if the upload or the input mapping fails, otherwise the
 *         result of nvenc_encode_base.
 */
bool cuda_encode(void *data, struct encoder_frame *frame,
                 struct encoder_packet *packet, bool *received_packet)
{
    struct nvenc_data *enc = data;

    /* The bitstream and the input surface share the same slot index. */
    struct nv_bitstream *bs = &enc->bitstreams.array[enc->next_bitstream];
    struct nv_cuda_surface *surf =
        &enc->surfaces.array[enc->next_bitstream];

    /* Record this frame's PTS in dts_list before doing any work. */
    deque_push_back(&enc->dts_list, &frame->pts, sizeof(frame->pts));

    /* Step 1: copy the frame into the CUDA surface. */
    if (!copy_frame(enc, frame, surf))
        return false;

    /* Step 2: map the registered surface so NVENC can read from it. */
    NV_ENC_MAP_INPUT_RESOURCE map = {NV_ENC_MAP_INPUT_RESOURCE_VER};
    map.mappedBufferFmt = enc->surface_format;
    map.registeredResource = surf->res;

    if (NV_FAILED(nv.nvEncMapInputResource(enc->session, &map)))
        return false;

    surf->mapped_res = map.mappedResource;

    /* Step 3: run the actual encode on the mapped input. */
    return nvenc_encode_base(enc, bs, surf->mapped_res, frame->pts, packet,
                             received_packet);
}