
I am learning how to create MP4 video from this example. The problem is that the example demonstrates audio encoding from dummy source data generated on the fly, whereas I need to encode audio from a file. I have checked many examples, and most of them show either the same approach or just audio encoding on its own. In my trial-and-error process I am using the same AVFormatContext for both audio and video frames. I am not sure whether that is the right thing to do, or whether I should have two separate contexts. So far I have video encoding working, but the audio stream fails because the AVPacket can't locate the correct audio stream index. Here is how I set up the audio stream:

  // Sets up the audio stream: looks up an encoder for codec_id, opens the file
  // named by _audioInName on `oc`, picks an audio stream from oc->streams,
  // overrides its codec settings and opens the codec.
  // Outputs: *codec receives the encoder that was found, *st the selected stream.
  // NOTE(review): several opening braces below are never closed, so this snippet
  // appears to have lost its closing braces; the real nesting may differ.
  void open_audio(AVFormatContext *oc, AVCodec **codec, AVStream **st ,enum AVCodecID codec_id){

    //    AVCodecContext *c;
    int ret;
    //    c = st->codec;

    // Look up the encoder for the requested codec id.
    *codec = avcodec_find_encoder(codec_id);
    if (!(*codec)) {
        fprintf(stderr, "Could not find encoder for '%s'\n",avcodec_get_name(codec_id));

    /* open it */

    // NOTE(review): `oc` is opened as an input here but is also read as an output
    // context further down (oc->oformat) — confirm that sharing one
    // AVFormatContext for demuxing and muxing is really intended.
    if(avformat_open_input(&oc,_audioInName.c_str(),NULL,NULL) !=0){

        Msg::PrintErrorMsg("Error opening audio file");


    AVStream* audioStream = NULL;

    // Find the audio stream (some container files can have multiple streams in them)

    // There is no break in the loop as shown, so the last audio stream wins.
    for (uint32_t i = 0; i < oc->nb_streams; ++i)


        if (oc->streams[i]->codec->codec_type == AVMEDIA_TYPE_AUDIO)


            audioStream = oc->streams[i];




    // NOTE(review): nothing visible stops execution after this message, so
    // audioStream would be dereferenced below while NULL — verify whether
    // Msg::PrintErrorMsg returns.
    if (audioStream == NULL)
        Msg::PrintErrorMsg("Could not find any audio stream in the file");

    *st =audioStream;

    // The stream's own codec context is reconfigured with the encoder and
    // hard-coded parameters (S16 samples, 64000 bit rate, 44100 Hz, 1 channel).
    AVCodecContext *c  = audioStream->codec;
    c->codec = *codec;//avcodec_find_decoder(c->codec_id);
    audioStream->id = 1;
    c->sample_fmt  = AV_SAMPLE_FMT_S16;
    c->bit_rate    = 64000;
    c->sample_rate = 44100;
    c->channels    = 1;

    if (oc->oformat->flags & AVFMT_GLOBALHEADER){
        c->flags |= CODEC_FLAG_GLOBAL_HEADER;


    // NOTE(review): the message mentions a decoder, but c->codec was assigned from
    // avcodec_find_encoder above.
    if (c->codec == NULL)
        Msg::PrintErrorMsg("Couldn't find a proper decoder");


    ret = avcodec_open2(c, *codec, NULL);
    if (ret < 0) {

        Msg::PrintErrorMsg("Could not open audio codec\n");



Here "oc" is the same context that is used to initialize the video stream.

Then I try to write an audio frame like this:

  // Reads packets from `oc`; for each packet whose stream_index equals st->index
  // it decodes the packet with st->codec, re-encodes the resulting frame and
  // writes the encoded packet back to `oc`.
  // NOTE(review): the closing braces of this snippet appear to be missing, so the
  // nesting shown is not reliable.
  // NOTE(review): `oc` is used both as the source (av_read_frame) and as the
  // destination (av_interleaved_write_frame) — confirm that this is intended.
  void write_audio_frame(AVFormatContext *oc, AVStream *st){
    AVCodecContext *c;
    AVPacket pkt = { 0 }; // data and size must be 0;
    AVFrame *frame = avcodec_alloc_frame();
    int got_packet, ret;
    c = st->codec;
    //  get_audio_frame(samples, audio_input_frame_size, c->channels);

    ////Read the packet:
    while(av_read_frame(oc,&pkt) == 0 ){

        // Only packets that belong to the given stream are processed.
        if(pkt.stream_index ==st->index){

        // Try to decode the packet into a frame
        int frameFinished = 0;
        avcodec_decode_audio4(c, frame, &frameFinished, &pkt);

        // Some frames rely on multiple packets, so we have to make sure the frame is finished before
        // we can use it
        if (frameFinished){
            // NOTE(review): the same context `c` is passed to both the decode call
            // above and the encode call here, and `pkt` serves as both the decoder
            // input and the encoder output — verify this is what was meant.
            ret = avcodec_encode_audio2(c, &pkt, frame, &got_packet);
            if (ret < 0) {
                Msg::PrintErrorMsg("Error encoding audio frame\n");

            if (!got_packet){
                printf("failed to aquire packet");
            pkt.stream_index = st->index;
            /* Write the compressed frame to the media file. */
            ret = av_interleaved_write_frame(oc, &pkt);
            if (ret != 0) {

                Msg::PrintErrorMsg("Error while writing audio frame.");



The thing is, I never get past this statement: "if(pkt.stream_index ==st->index)". The packet stream index is never equal to the audio stream index. Can anyone point out where I am going wrong?


I did manage to open the input audio stream for encoding, but I can't encode the audio and video streams into a single output. From what I can see, PTS and DTS are probably the source of the problem. Currently I calculate the pts based on the muxing.c example, but it doesn't work for audio at all.

Here is how I use it:

   // Main muxing loop: for each queued video frame it computes the current audio
   // and video times in seconds, then either copies an audio packet read from
   // `informat` to the output or writes the video frame.
   // NOTE(review): closing braces and probably some `else` keywords have been
   // lost from this snippet, so the branch structure shown is not reliable.
   while(frame_count < _streamDurationNBFrames-1){

        uint8_t *frameToWrite =_frames.front();

        // Compute current audio and video time. ///

        // NOTE(review): the condition tests audio_st but the time is read from
        // audioIn_st — confirm which stream is meant.
        if (audio_st){
            audio_pts = (double)audioIn_st->pts.val * audioIn_st->time_base.num / audioIn_st->time_base.den;

            // NOTE(review): presumably this was the else-branch; as shown it resets
            // audio_pts right after computing it.
            audio_pts = 0.0;
        if (video_st){

            video_pts = (double)video_st->pts.val * video_st->time_base.num /   video_st->time_base.den;

            // NOTE(review): presumably the else-branch as well.
            video_pts = 0.0;

        // Both streams (when present) have reached _streamDuration; the body of this
        // check is not visible.
        if ((!audio_st || audio_pts >= _streamDuration) && (!video_st || video_pts >= _streamDuration)){



        // Audio is written while it lags behind video.
        if (audio_st && audio_pts < video_pts) {
            // The return value is ignored, so a failed read is not detected here.
            av_read_frame(informat, &pkt);//read audio from input stream
             Msg::PrintMsg("Encode audio here...");

          //==================   AUDIO ENCODE HERE   

           // The input packet is forwarded as-is: payload, stream index and
           // timestamps are copied without re-encoding or rescaling.
           outpkt.data = pkt.data;
           outpkt.size = pkt.size;
           outpkt.stream_index = pkt.stream_index;
           outpkt.flags |= AV_PKT_FLAG_KEY;
           outpkt.pts = pkt.pts;
           outpkt.dts =pkt.dts;
           if(av_interleaved_write_frame(oc, &outpkt) < 0)
            Msg::PrintErrorMsg("Fail Audio Write ");

          //==================   VIDEO  ENCODE HERE   

            write_video_frame(oc, video_st,frameToWrite);

            // Advance the frame pts by one codec tick expressed in stream time base.
            frame->pts += av_rescale_q(1, video_st->codec->time_base, video_st->time_base);

        ///at last delete this frame:
        // NOTE(review): frameToWrite is a uint8_t*; if it was allocated with new[]
        // this should be delete[] — allocation site not visible, TODO confirm.
        delete frameToWrite; ///deallocate the written frame!

Somehow, once I am in the audio encoding loop, the audio_pts never reaches the video_pts and is always zero:

audio_pts = (double)audio_st->pts.val * audio_st->time_base.num / audio_st->time_base.den; is always zero because (double)audio_st->pts.val returns zero.

So basically I am asking the same question again: how do I do the muxing when the audio comes from an external file?

By the way, the answer below doesn't help, as it presumes that both the audio and video streams come from the same file, whereas in my case only the audio comes from the external source.


2 Answers


You can make use of the same context; there is no need for two separate contexts. If you are encoding both video and audio, then you will first need to create the video stream and then the audio stream. However, if you want to encode only audio, then you need to create just the audio stream. if(pkt.stream_index ==st->index) is normally required when you are transcoding, i.e. when you are changing the container format. In that case you will be reading frames from one video file and writing them to another file, so you need to know whether each frame comes from the audio or the video stream. However, if you are getting decoded audio packets, then you will need to set the proper stream index in the audio packet before calling av_interleaved_write_frame.

In your code you are not setting the pts and dts of the audio packets, which are required for proper encoding.

Some time ago I wrote a program similar to yours; you can look at it for reference.

// Opens `filename` as the input, records its video and audio stream indexes,
// derives m_in_end_time and m_fps, then builds an output context for
// filename + "clip_out.avi" with one output stream per input stream (codec
// contexts copied from the input), opens the output file and writes the header.
// Returns ret; the visible error paths set it to -1 before returning.
// NOTE(review): braces and `else` keywords appear to have been stripped from
// this listing, so success and failure branches run together as shown.
int VideoClipper::Init(const wxString& filename)
    int ret = 0;
    // NOTE(review): errbuf is printed below but never filled in the visible code.
    char errbuf[64];

    if ((ret = avformat_open_input( &m_informat, filename.mb_str(), 0, 0)) != 0 )
        PRINT_VAL("Not able to Open file;; ", errbuf)
        ret = -1;
        return ret;
        PRINT_MSG("Opened File ")

    if ((ret = avformat_find_stream_info(m_informat, 0))< 0 )

        PRINT_VAL("Not Able to find stream info:: ", errbuf)
        ret = -1;
        return ret;
        PRINT_MSG("Got stream Info ")

    // Remember the index and stream pointer of the video and audio streams.
    for(unsigned int i = 0; i<m_informat->nb_streams; i++)
        if(m_informat->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO)

            PRINT_MSG("Found Video Stream ")
            m_in_vid_strm_idx = i;
            m_in_vid_strm = m_informat->streams[i];

        if(m_informat->streams[i]->codec->codec_type == AVMEDIA_TYPE_AUDIO)
            PRINT_MSG("Found Audio Stream ")
            m_in_aud_strm_idx = i;
            m_in_aud_strm = m_informat->streams[i];

    // -1 is treated as "stream not found" for both indexes.
    if(m_in_aud_strm_idx == -1 && m_in_vid_strm_idx == -1)
       ret = -1;    

    // When the container has no duration, fall back to a stream duration.
    if(m_informat->duration == AV_NOPTS_VALUE)
        if(m_in_vid_strm_idx != -1 && m_informat->streams[m_in_vid_strm_idx])
            if(m_informat->streams[m_in_vid_strm_idx]->duration != AV_NOPTS_VALUE)
                //m_in_end_time = (m_informat->streams[m_in_vid_strm_idx]->duration)/(AV_TIME_BASE);
                // Video duration is divided by time_base.den/time_base.num.
                m_in_end_time = (m_informat->streams[m_in_vid_strm_idx]->duration)/(m_informat->streams[m_in_vid_strm_idx]->time_base.den/m_informat->streams[m_in_vid_strm_idx]->time_base.num);


        else if(m_in_aud_strm_idx != -1 && m_informat->streams[m_in_aud_strm_idx])
            if(m_informat->streams[m_in_aud_strm_idx]->duration != AV_NOPTS_VALUE)
                // NOTE(review): the audio stream duration is divided by AV_TIME_BASE
                // rather than by the stream time base as in the video branch — verify.
                m_in_end_time = (m_informat->streams[m_in_aud_strm_idx]->duration)/(AV_TIME_BASE);
        m_in_end_time = (m_informat->duration)/(AV_TIME_BASE);

    // Frame rate from r_frame_rate (integer division); 25 looks like the fallback.
    if(m_in_vid_strm_idx != -1 && m_informat->streams[m_in_vid_strm_idx])
        // NOTE(review): r_frame_rate.num is compared against AV_NOPTS_VALUE here.
        if(m_informat->streams[m_in_vid_strm_idx]->r_frame_rate.num != AV_NOPTS_VALUE && m_informat->streams[m_in_vid_strm_idx]->r_frame_rate.den != 0)
            m_fps =  (m_informat->streams[m_in_vid_strm_idx]->r_frame_rate.num)/ (m_informat->streams[m_in_vid_strm_idx]->r_frame_rate.den);
        m_fps = 25;    
    // The output file name is the input name with "clip_out.avi" appended; the
    // output format is guessed from that name.
    AVOutputFormat *outfmt = NULL;
    std::string outfile = std::string(filename) + "clip_out.avi";
    outfmt = av_guess_format(NULL,outfile.c_str(),NULL);

    if(outfmt == NULL)
        ret = -1;
        return ret;
        m_outformat = avformat_alloc_context();
            m_outformat->oformat = outfmt;
            _snprintf(m_outformat->filename, sizeof(m_outformat->filename), "%s", outfile.c_str());    
            ret = -1;
            return ret;

    AVCodec *out_vid_codec,*out_aud_codec;
    out_vid_codec = out_aud_codec = NULL;

    // Video output stream: created only when the format has a default video codec
    // and the input has a video stream.
    if(outfmt->video_codec != AV_CODEC_ID_NONE && m_in_vid_strm != NULL)
        out_vid_codec = avcodec_find_encoder(outfmt->video_codec);
        if(NULL == out_vid_codec)
            PRINT_MSG("Could Not Find Vid Encoder")
            ret = -1;
            return ret;
            PRINT_MSG("Found Out Vid Encoder ")
            m_out_vid_strm = avformat_new_stream(m_outformat, out_vid_codec);
            if(NULL == m_out_vid_strm)
                 PRINT_MSG("Failed to Allocate Output Vid Strm ")
                 ret = -1;
                 return ret;
                 PRINT_MSG("Allocated Video Stream ")
                 // The input codec context is copied and codec_id is then forced to
                 // the input's id, so packets can be written without re-encoding.
                 if(avcodec_copy_context(m_out_vid_strm->codec, m_informat->streams[m_in_vid_strm_idx]->codec) != 0)
                    PRINT_MSG("Failed to Copy Context ")
                    ret = -1;
                    return ret;
                    // NOTE(review): den is read from the output codec context while
                    // num is read from the input codec context — confirm intent.
                    m_out_vid_strm->sample_aspect_ratio.den = m_out_vid_strm->codec->sample_aspect_ratio.den;
                    m_out_vid_strm->sample_aspect_ratio.num = m_in_vid_strm->codec->sample_aspect_ratio.num;
                    PRINT_MSG("Copied Context ")
                    m_out_vid_strm->codec->codec_id = m_in_vid_strm->codec->codec_id;
                    m_out_vid_strm->codec->time_base.num = 1;
                    m_out_vid_strm->codec->time_base.den = m_fps*(m_in_vid_strm->codec->ticks_per_frame);         
                    // Stream time base is set to 1/1000.
                    m_out_vid_strm->time_base.num = 1;
                    m_out_vid_strm->time_base.den = 1000;
                    m_out_vid_strm->r_frame_rate.num = m_fps;
                    m_out_vid_strm->r_frame_rate.den = 1;
                    m_out_vid_strm->avg_frame_rate.den = 1;
                    m_out_vid_strm->avg_frame_rate.num = m_fps;
                    // m_out_end_time and m_out_start_time are not assigned in this
                    // function; presumably set elsewhere in the class — TODO confirm.
                    m_out_vid_strm->duration = (m_out_end_time - m_out_start_time)*1000;

    // Audio output stream: same approach, keeping the input stream's time base.
    if(outfmt->audio_codec != AV_CODEC_ID_NONE && m_in_aud_strm != NULL)
        out_aud_codec = avcodec_find_encoder(outfmt->audio_codec);
        if(NULL == out_aud_codec)
            PRINT_MSG("Could Not Find Out Aud Encoder ")
            ret = -1;
            return ret;
            PRINT_MSG("Found Out Aud Encoder ")
            m_out_aud_strm = avformat_new_stream(m_outformat, out_aud_codec);
            if(NULL == m_out_aud_strm)
                // NOTE(review): the message says "Vid" although this is the audio stream.
                PRINT_MSG("Failed to Allocate Out Vid Strm ")
                ret = -1;
                return ret;
                if(avcodec_copy_context(m_out_aud_strm->codec, m_informat->streams[m_in_aud_strm_idx]->codec) != 0)
                    PRINT_MSG("Failed to Copy Context ")
                    ret = -1;
                    return ret;
                    PRINT_MSG("Copied Context ")
                    m_out_aud_strm->codec->codec_id = m_in_aud_strm->codec->codec_id;
                    m_out_aud_strm->codec->codec_tag = 0;
                    m_out_aud_strm->pts = m_in_aud_strm->pts;
                    m_out_aud_strm->duration = m_in_aud_strm->duration;
                    m_out_aud_strm->time_base.num = m_in_aud_strm->time_base.num;
                    m_out_aud_strm->time_base.den = m_in_aud_strm->time_base.den;


      // Open the output file unless the format is flagged AVFMT_NOFILE.
      if (!(outfmt->flags & AVFMT_NOFILE)) 
        if (avio_open2(&m_outformat->pb, outfile.c_str(), AVIO_FLAG_WRITE,NULL, NULL) < 0) 
                PRINT_VAL("Could Not Open File ", outfile)
                ret = -1;
                return ret;
        /* Write the stream header, if any. */
      if (avformat_write_header(m_outformat, NULL) < 0) 
            PRINT_VAL("Error Occurred While Writing Header ", outfile)
            ret = -1;
            return ret;
            PRINT_MSG("Written Output header ")
            m_init_done = true;

    return ret;

// Reads packets from m_informat while m_num_frames lasts and forwards each
// video or audio packet to m_outformat without re-encoding, assigning its
// pts/dts from the local vid_pts / aud_pts counters.
// NOTE(review): braces, `else` keywords and apparently some statements are
// missing from this listing (for example, nothing visible ever derives vid_pts
// or aud_pts from pkt), so the logic cannot be read reliably as shown.
int VideoClipper::GenerateClip(void)
    // NOTE(review): outpkt is never initialised in the visible code, yet
    // `outpkt.flags |= ...` is applied to it below.
    AVPacket pkt, outpkt;
    int aud_pts = 0, vid_pts = 0, aud_dts = 0, vid_dts = 0;
    int last_vid_pts = 0;
        while(av_read_frame(m_informat, &pkt) >= 0 && (m_num_frames-- > 0))
            if(pkt.stream_index == m_in_vid_strm_idx)
                // Logging only: input timestamps rescaled from the stream time base
                // to the codec time base.
                PRINT_VAL("ACTUAL VID Pkt PTS ",av_rescale_q(pkt.pts,m_in_vid_strm->time_base, m_in_vid_strm->codec->time_base))
                PRINT_VAL("ACTUAL VID Pkt DTS ", av_rescale_q(pkt.dts, m_in_vid_strm->time_base, m_in_vid_strm->codec->time_base ))
                if(pkt.pts != AV_NOPTS_VALUE)
                    // NOTE(review): as shown this assigns a value that is already
                    // equal; a statement was probably lost here.
                    if(last_vid_pts == vid_pts)
                        last_vid_pts = vid_pts;
                    outpkt.pts = vid_pts;   
                    PRINT_VAL("ReScaled VID Pts ", outpkt.pts)
                    // Presumably the else-branch: no pts on input means none on output.
                    outpkt.pts = AV_NOPTS_VALUE;

                if(pkt.dts == AV_NOPTS_VALUE)
                    outpkt.dts = AV_NOPTS_VALUE;
                    outpkt.dts = vid_pts;
                    PRINT_VAL("ReScaled VID Dts ", outpkt.dts)

                // Forward the payload unchanged and mark the packet as a key frame.
                outpkt.data = pkt.data;
                outpkt.size = pkt.size;
                outpkt.stream_index = pkt.stream_index;
                outpkt.flags |= AV_PKT_FLAG_KEY;
                last_vid_pts = vid_pts;
                if(av_interleaved_write_frame(m_outformat, &outpkt) < 0)
                    PRINT_MSG("Failed Video Write ")
            else if(pkt.stream_index == m_in_aud_strm_idx)
                PRINT_VAL("ACTUAL AUD Pkt PTS ", av_rescale_q(pkt.pts, m_in_aud_strm->time_base, m_in_aud_strm->codec->time_base))
                PRINT_VAL("ACTUAL AUD Pkt DTS ", av_rescale_q(pkt.dts, m_in_aud_strm->time_base, m_in_aud_strm->codec->time_base))
                if(pkt.pts != AV_NOPTS_VALUE)
                    outpkt.pts = aud_pts;
                    PRINT_VAL("ReScaled AUD PTS ", outpkt.pts)
                    outpkt.pts = AV_NOPTS_VALUE;

                if(pkt.dts == AV_NOPTS_VALUE)
                    outpkt.dts = AV_NOPTS_VALUE;
                    outpkt.dts = aud_pts;
                    PRINT_VAL("ReScaled AUD DTS ", outpkt.dts)
                    // The checks below force pts and dts to agree with each other.
                    if( outpkt.pts >= outpkt.dts)
                        outpkt.dts = outpkt.pts;
                    // NOTE(review): this condition has no visible body of its own.
                    if(outpkt.dts == aud_dts)
                    if(outpkt.pts < outpkt.dts)
                        outpkt.pts = outpkt.dts;
                        aud_pts = outpkt.pts;

                outpkt.data = pkt.data;
                outpkt.size = pkt.size;
                outpkt.stream_index = pkt.stream_index;
                outpkt.flags |= AV_PKT_FLAG_KEY;
                // The video counter is driven from the audio counter here.
                vid_pts = aud_pts;
                if(av_interleaved_write_frame(m_outformat, &outpkt) < 0)
                    PRINT_MSG("Faile Audio Write ")
            PRINT_MSG("Got Unknown Pkt ")

    // NOTE(review): two consecutive returns — the condition selecting between
    // them is not visible.
    return 0;    
    return -1;

You may use the AVBlocks library (example). I used the example code to solve the same problem.

To make it work correctly with formats other than .aac, I made some changes to StreamType::* and StreamSubType::*.