I have managed to create an RTSP stream using libav* and a DirectX texture (which I obtain from the GDI API using the BitBlt method). Here is my approach for creating the live RTSP stream (the GDI capture that produces the pixel data is sketched first for context).
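The raw BGRA pixels that feed the encoder come out of GDI roughly like this. This is a simplified sketch only; in the actual code the capture is wrapped behind `WindowManager` and `GetPixels`, so the window handle and buffer names here are placeholders:

```cpp
// Minimal sketch of the GDI/BitBlt capture that feeds the encoder (simplified;
// the real implementation lives behind WindowManager::GetPixels).
#include <windows.h>
#include <cstdint>
#include <vector>

bool CaptureWindowBGRA(HWND hwnd, int width, int height, std::vector<uint8_t>& out)
{
    HDC window_dc = GetDC(hwnd);
    HDC mem_dc = CreateCompatibleDC(window_dc);

    // Top-down 32-bit DIB so the buffer is tightly packed BGRA, matching the 4*width stride used later.
    BITMAPINFO bmi = {};
    bmi.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
    bmi.bmiHeader.biWidth = width;
    bmi.bmiHeader.biHeight = -height;      // negative height -> top-down rows
    bmi.bmiHeader.biPlanes = 1;
    bmi.bmiHeader.biBitCount = 32;
    bmi.bmiHeader.biCompression = BI_RGB;

    void* bits = nullptr;
    HBITMAP dib = CreateDIBSection(window_dc, &bmi, DIB_RGB_COLORS, &bits, nullptr, 0);
    HGDIOBJ old = SelectObject(mem_dc, dib);

    // Copy the window contents into the DIB section.
    BOOL ok = BitBlt(mem_dc, 0, 0, width, height, window_dc, 0, 0, SRCCOPY);
    if (ok)
    {
        out.assign(static_cast<uint8_t*>(bits),
                   static_cast<uint8_t*>(bits) + width * height * 4);
    }

    SelectObject(mem_dc, old);
    DeleteObject(dib);
    DeleteDC(mem_dc);
    ReleaseDC(hwnd, window_dc);
    return ok == TRUE;
}
```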
Create output context and stream (skipping the checks here; a version with the checks restored is sketched after the calls below)
```cpp
avformat_alloc_output_context2(&ofmt_ctx, NULL, "rtsp", rtsp_url); //RTSP
vid_codec = avcodec_find_encoder(ofmt_ctx->oformat->video_codec);
vid_stream = avformat_new_stream(ofmt_ctx, vid_codec);
vid_codec_ctx = avcodec_alloc_context3(vid_codec);
```
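As mentioned, the checks are skipped above. For completeness, a minimal sketch of the same setup with the error checks restored (same variable names as above, `rtsp_url` is a placeholder, and this sits inside the same setup function):

```cpp
// Same calls as above, with the omitted error handling sketched in.
AVFormatContext* ofmt_ctx = nullptr;
if (avformat_alloc_output_context2(&ofmt_ctx, NULL, "rtsp", rtsp_url) < 0 || !ofmt_ctx)
{
    Debug::Error("Could not allocate RTSP output context");
    return false;
}

const AVCodec* vid_codec = avcodec_find_encoder(ofmt_ctx->oformat->video_codec);
if (!vid_codec)
{
    Debug::Error("Encoder not found: ", avcodec_get_name(ofmt_ctx->oformat->video_codec));
    return false;
}

AVStream* vid_stream = avformat_new_stream(ofmt_ctx, vid_codec);
if (!vid_stream)
{
    Debug::Error("Could not create output stream");
    return false;
}

AVCodecContext* vid_codec_ctx = avcodec_alloc_context3(vid_codec);
if (!vid_codec_ctx)
{
    Debug::Error("Could not allocate encoder context");
    return false;
}
```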
Set codec params
```cpp
codec_ctx->codec_tag = 0;
codec_ctx->codec_id = ofmt_ctx->oformat->video_codec;
//codec_ctx->codec_type = AVMEDIA_TYPE_VIDEO;
codec_ctx->width = width;
codec_ctx->height = height;
codec_ctx->gop_size = 12;
//codec_ctx->gop_size = 40;
//codec_ctx->max_b_frames = 3;
codec_ctx->pix_fmt = target_pix_fmt; // AV_PIX_FMT_YUV420P
codec_ctx->framerate = { stream_fps, 1 };
codec_ctx->time_base = { 1, stream_fps };
if (fctx->oformat->flags & AVFMT_GLOBALHEADER)
{
    codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
}
```
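With `time_base = {1, stream_fps}` and a pts that increments by 1 per frame (as in the streaming loop further down), each pts step corresponds to exactly one frame interval. A small worked example of that relationship, assuming `stream_fps` is 30 and the usual 90 kHz stream time base the RTSP muxer picks for video:

```cpp
// Worked example of the timing relationship assumed above (values are illustrative).
extern "C" {
#include <libavutil/rational.h>
#include <libavutil/mathematics.h>
}

AVRational enc_tb    = { 1, 30 };     // encoder time_base = 1 / stream_fps
AVRational stream_tb = { 1, 90000 };  // typical RTP/RTSP video clock

int64_t frame_index = 10;             // pts of the 11th frame in encoder time_base
int64_t pts_in_stream_tb = av_rescale_q(frame_index, enc_tb, stream_tb);
// frame 10 at 30 fps = 0.333 s  ->  10 * 90000 / 30 = 30000 ticks at 90 kHz
```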
Initialize video stream
```cpp
if (avcodec_parameters_from_context(stream->codecpar, codec_ctx) < 0)
{
    Debug::Error("Could not initialize stream codec parameters!");
    return false;
}

AVDictionary* codec_options = nullptr;
if (codec->id == AV_CODEC_ID_H264) {
    av_dict_set(&codec_options, "profile", "high", 0);
    av_dict_set(&codec_options, "preset", "fast", 0);
    av_dict_set(&codec_options, "tune", "zerolatency", 0);
}

// open video encoder
int ret = avcodec_open2(codec_ctx, codec, &codec_options);
if (ret < 0)
{
    Debug::Error("Could not open video encoder: ", avcodec_get_name(codec->id), " error ret: ", AVERROR(ret));
    return false;
}

stream->codecpar->extradata = codec_ctx->extradata;
stream->codecpar->extradata_size = codec_ctx->extradata_size;
```
Start streaming
```cpp
// Create new frame and allocate buffer
AVFrame* AllocateFrameBuffer(AVCodecContext* codec_ctx, double width, double height)
{
    AVFrame* frame = av_frame_alloc();
    std::vector<uint8_t> framebuf(av_image_get_buffer_size(codec_ctx->pix_fmt, width, height, 1));
    av_image_fill_arrays(frame->data, frame->linesize, framebuf.data(), codec_ctx->pix_fmt, width, height, 1);
    frame->width = width;
    frame->height = height;
    frame->format = static_cast<int>(codec_ctx->pix_fmt);
    //Debug::Log("framebuf size: ", framebuf.size(), " frame format: ", frame->format);
    return frame;
}

void RtspStream(AVFormatContext* ofmt_ctx, AVStream* vid_stream, AVCodecContext* vid_codec_ctx, char* rtsp_url)
{
    printf("Output stream info:\n");
    av_dump_format(ofmt_ctx, 0, rtsp_url, 1);

    const int width = WindowManager::Get().GetWindow(RtspStreaming::WindowId())->GetTextureWidth();
    const int height = WindowManager::Get().GetWindow(RtspStreaming::WindowId())->GetTextureHeight();

    //DirectX BGRA to h264 YUV420p
    SwsContext* conversion_ctx = sws_getContext(width, height, src_pix_fmt,
        vid_stream->codecpar->width, vid_stream->codecpar->height, target_pix_fmt,
        SWS_BICUBIC | SWS_BITEXACT, nullptr, nullptr, nullptr);
    if (!conversion_ctx)
    {
        Debug::Error("Could not initialize sample scaler!");
        return;
    }

    AVFrame* frame = AllocateFrameBuffer(vid_codec_ctx, vid_codec_ctx->width, vid_codec_ctx->height);
    if (!frame)
    {
        Debug::Error("Could not allocate video frame\n");
        return;
    }

    if (avformat_write_header(ofmt_ctx, NULL) < 0)
    {
        Debug::Error("Error occurred when writing header");
        return;
    }
    if (av_frame_get_buffer(frame, 0) < 0)
    {
        Debug::Error("Could not allocate the video frame data\n");
        return;
    }

    int frame_cnt = 0;
    //av start time in microseconds
    int64_t start_time_av = av_gettime();
    AVRational time_base = vid_stream->time_base;
    AVRational time_base_q = { 1, AV_TIME_BASE };

    // frame pixel data info
    int data_size = width * height * 4;
    uint8_t* data = new uint8_t[data_size];
    // AVPacket* pkt = av_packet_alloc();

    while (RtspStreaming::IsStreaming())
    {
        /* make sure the frame data is writable */
        if (av_frame_make_writable(frame) < 0)
        {
            Debug::Error("Can't make frame writable");
            break;
        }

        //get copy/ref of the texture
        //uint8_t* data = WindowManager::Get().GetWindow(RtspStreaming::WindowId())->GetBuffer();
        if (!WindowManager::Get().GetWindow(RtspStreaming::WindowId())->GetPixels(data, 0, 0, width, height))
        {
            Debug::Error("Failed to get frame buffer. ID: ", RtspStreaming::WindowId());
            std::this_thread::sleep_for(std::chrono::seconds(2));
            continue;
        }
        //printf("got pixels data\n");

        // convert BGRA to yuv420 pixel format
        int srcStrides[1] = { 4 * width };
        if (sws_scale(conversion_ctx, &data, srcStrides, 0, height, frame->data, frame->linesize) < 0)
        {
            Debug::Error("Unable to scale d3d11 texture to frame. ", frame_cnt);
            break;
        }
        //Debug::Log("frame pts: ", frame->pts, " time_base:", av_rescale_q(1, vid_codec_ctx->time_base, vid_stream->time_base));
        frame->pts = frame_cnt++;
        //frame_cnt++;
        //printf("scale conversion done\n");

        //encode to the video stream
        int ret = avcodec_send_frame(vid_codec_ctx, frame);
        if (ret < 0)
        {
            Debug::Error("Error sending frame to codec context! ", frame_cnt);
            break;
        }

        AVPacket* pkt = av_packet_alloc();
        //av_init_packet(pkt);
        ret = avcodec_receive_packet(vid_codec_ctx, pkt);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
        {
            //av_packet_unref(pkt);
            av_packet_free(&pkt);
            continue;
        }
        else if (ret < 0)
        {
            Debug::Error("Error during receiving packet: ", AVERROR(ret));
            //av_packet_unref(pkt);
            av_packet_free(&pkt);
            break;
        }

        if (pkt->pts == AV_NOPTS_VALUE)
        {
            //Write PTS
            //Duration between 2 frames (us)
            int64_t calc_duration = (double)AV_TIME_BASE / av_q2d(vid_stream->r_frame_rate);
            //Parameters
            pkt->pts = (double)(frame_cnt * calc_duration) / (double)(av_q2d(time_base) * AV_TIME_BASE);
            pkt->dts = pkt->pts;
            pkt->duration = (double)calc_duration / (double)(av_q2d(time_base) * AV_TIME_BASE);
        }

        int64_t pts_time = av_rescale_q(pkt->dts, time_base, time_base_q);
        int64_t now_time = av_gettime() - start_time_av;

        if (pts_time > now_time)
            av_usleep(pts_time - now_time);

        //pkt.pts = av_rescale_q_rnd(pkt.pts, in_stream->time_base, out_stream->time_base, (AVRounding)(AV_ROUND_NEAR_INF | AV_ROUND_PASS_MINMAX));
        //pkt.dts = av_rescale_q_rnd(pkt.dts, in_stream->time_base, out_stream->time_base, (AVRounding)(AV_ROUND_NEAR_INF | AV_ROUND_PASS_MINMAX));
        //pkt.duration = av_rescale_q(pkt.duration, in_stream->time_base, out_stream->time_base);
        //pkt->pos = -1;

        //write frame and send
        if (av_interleaved_write_frame(ofmt_ctx, pkt) < 0)
        {
            Debug::Error("Error muxing packet, frame number: ", frame_cnt);
            break;
        }

        //Debug::Log("RTSP streaming...");
        //std::this_thread::sleep_for(std::chrono::milliseconds(1000/20));
        //av_packet_unref(pkt);
        av_packet_free(&pkt);
    }
    //av_free_packet(pkt);
    delete[] data;

    /* Write the trailer, if any. The trailer must be written before you
     * close the CodecContexts open when you wrote the header; otherwise
     * av_write_trailer() may try to use memory that was freed on
     * av_codec_close(). */
    av_write_trailer(ofmt_ctx);

    av_frame_unref(frame);
    av_frame_free(&frame);
    printf("streaming thread CLOSED!\n");
}
```
Now, this allows me to connect to my RTSP server and maintain the connection. However, on the RTSP client side I am getting either a gray frame or a single static frame, as shown below:
I would appreciate help with the following questions:
- Firstly, why is the stream not working despite the continued connection to the server and the frames being updated?
- Video codec. By default the RTSP output format uses the MPEG-4 codec; is it possible to use H.264? When I manually set it to AV_CODEC_ID_H264, the program fails at avcodec_open2 with a return value of -22 (a simplified sketch of what I mean follows this list).
- Do I need to create and allocate a new AVFrame and AVPacket for every frame, or can I just reuse a global variable for this?
- Do I need to add any code explicitly for real-time streaming (like the "-re" flag in ffmpeg)?
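To clarify the second question, "manually setting" the codec means roughly the following (a simplified sketch; the rest of the setup is unchanged from the snippets above, and -22 is AVERROR(EINVAL)):

```cpp
// Simplified sketch of what "manually set it to AV_CODEC_ID_H264" means.
const AVCodec* vid_codec = avcodec_find_encoder(AV_CODEC_ID_H264);
AVStream* vid_stream = avformat_new_stream(ofmt_ctx, vid_codec);
AVCodecContext* vid_codec_ctx = avcodec_alloc_context3(vid_codec);
vid_codec_ctx->codec_id = AV_CODEC_ID_H264;   // instead of ofmt_ctx->oformat->video_codec
// ... same width/height/pix_fmt/time_base settings as above ...
// avcodec_open2() then fails here with -22.
```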
It would be great if you could point out some example code for creating a live stream. I have checked the following resources:
- https://github.com/FFmpeg/FFmpeg/blob/master/doc/examples/encode_video.c
- streaming FLV to RTMP with FFMpeg using H264 codec and C++ API to flv.js
- https://medium.com/swlh/streaming-video-with-ffmpeg-and-directx-11-7395fcb372c4
Update
While testing, I found that I am able to play the stream using ffplay, while it gets stuck in the VLC player. Here is a snapshot of the ffplay log: