After some time I was finally able to sit down with this problem again, and I now have a solution that satisfies me. I went with an interleaved RTP stream (RTP is interleaved with RTCP over a single TCP connection).
So I had an interleaved RTCP/RTP stream that needed to be split into audio (PCM A-law) and video (H.264 Constrained Baseline) RTP packets.
The decomposition of an RTSP stream containing interleaved RTP data is described in RFC 2326 (section 10.12, "Embedded (Interleaved) Binary Data").
Depacketization of H.264 is described in RFC 6184; for PCM A-law, the RTP payload turned out to be raw audio, so no depacketization was necessary.
The next step was to calculate a proper PTS (presentation timestamp) for each stream. That was a bit of a hassle, but in the end the Live555 code came to the rescue
(see RTP lipsync synchronization).
The last task was to mux everything into a container that supports PCM A-law; I used FFmpeg's libav* libraries (libavformat and libavcodec).
There are many examples on the Internet, but many of them are outdated (FFmpeg's API changes very frequently), so I'm posting the most important parts of what actually worked for me in the end:
The setup part:
#include <stdint.h>
#include <stdio.h>
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include "libavutil/intreadwrite.h"
#include "libavutil/mathematics.h"
/*
 * Muxer setup: choose the container from the output file name, then describe
 * one H.264 video stream and one PCM A-law audio stream.
 *
 * Reads pu8outFileName, u32width, u32height and u8fps, which are not declared
 * in this snippet and are presumably provided by the enclosing scope.
 *
 * NOTE(review): the results of av_guess_format(),
 * avformat_alloc_output_context2(), avcodec_find_encoder(),
 * avcodec_alloc_context3() and avformat_new_stream() are used without a NULL
 * check. Add checks that match the enclosing function's error convention.
 */
AVFormatContext *formatContext = NULL;
AVOutputFormat *outputFormat = NULL;
AVStream *video_st = NULL;
AVStream *audio_st = NULL;
AVCodec *av_encode_codec = NULL;
AVCodec *av_audio_encode_codec = NULL;
AVCodecContext *av_video_encode_codec_ctx = NULL;
AVCodecContext *av_audio_encode_codec_ctx = NULL;

av_register_all();
av_log_set_level(AV_LOG_TRACE);

/* Container format is guessed from the output file name. */
outputFormat = av_guess_format(NULL, pu8outFileName, NULL);
outputFormat->video_codec = AV_CODEC_ID_H264;
outputFormat->audio_codec = AV_CODEC_ID_PCM_ALAW;

av_encode_codec = avcodec_find_encoder(AV_CODEC_ID_H264);
av_audio_encode_codec = avcodec_find_encoder(AV_CODEC_ID_PCM_ALAW);

avformat_alloc_output_context2(&formatContext, NULL, NULL, pu8outFileName);
formatContext->oformat = outputFormat;
/* Bounded copy: filename is a fixed-size array, so never write past its end. */
snprintf(formatContext->filename, sizeof formatContext->filename, "%s", pu8outFileName);

av_video_encode_codec_ctx = avcodec_alloc_context3(av_encode_codec);
av_audio_encode_codec_ctx = avcodec_alloc_context3(av_audio_encode_codec);

/* Video: H.264, YUV 4:2:0, no B-frames. */
av_video_encode_codec_ctx->codec_id = outputFormat->video_codec;
av_video_encode_codec_ctx->codec_type = AVMEDIA_TYPE_VIDEO;
/* NOTE(review): 4000 looks very low if this is bits per second; confirm the intended value. */
av_video_encode_codec_ctx->bit_rate = 4000;
av_video_encode_codec_ctx->width = u32width;
av_video_encode_codec_ctx->height = u32height;
av_video_encode_codec_ctx->time_base = (AVRational){ 1, u8fps };
av_video_encode_codec_ctx->max_b_frames = 0;
av_video_encode_codec_ctx->pix_fmt = AV_PIX_FMT_YUV420P;

/* Audio: mono PCM A-law at 8 kHz. */
av_audio_encode_codec_ctx->sample_fmt = AV_SAMPLE_FMT_S16;
av_audio_encode_codec_ctx->codec_id = AV_CODEC_ID_PCM_ALAW;
av_audio_encode_codec_ctx->codec_type = AVMEDIA_TYPE_AUDIO;
av_audio_encode_codec_ctx->sample_rate = 8000;
av_audio_encode_codec_ctx->channels = 1;
/* NOTE(review): the audio time base is tied to the video frame rate here; confirm this is intended. */
av_audio_encode_codec_ctx->time_base = (AVRational){ 1, u8fps };
av_audio_encode_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO;

/*
 * Streams are created video first, then audio. avformat_new_stream() assigns
 * each stream's index itself, so the index is not written by hand.
 */
video_st = avformat_new_stream(formatContext, av_encode_codec);
audio_st = avformat_new_stream(formatContext, av_audio_encode_codec);

/*
 * Frame rate expressed directly as u8fps/1. The former
 * 90000 / (90000 / u8fps) form lost precision to integer division and
 * divided by zero when u8fps was 0.
 */
video_st->avg_frame_rate = (AVRational){ u8fps, 1 };
av_stream_set_r_frame_rate(video_st, (AVRational){ u8fps, 1 });
The packets for video are written like this:
/*
 * Hand one depacketized H.264 frame to the muxer.
 * Reads video_frame, u32LastFrameSize, s_video_sync, video_st and
 * formatContext, none of which are declared in this snippet.
 */
uint8_t *pu8framePtr = video_frame;
AVPacket pkt = { 0 };
av_init_packet(&pkt);
/*
 * Assumes the buffer begins with a 4-byte start code, so byte 4 is the NAL
 * unit header. TODO confirm against the depacketizer.
 * The NAL unit type is the low 5 bits of that header; the bits above it
 * (nal_ref_idc) vary between encoders, so they are masked off rather than
 * compared. Types 5 (IDR slice), 7 (SPS) and 8 (PPS) mark a key frame.
 * The length guard keeps the read inside the frame buffer.
 */
if (u32LastFrameSize > 4)
{
    const uint8_t u8nalType = pu8framePtr[4] & 0x1F;
    if (5 == u8nalType || 7 == u8nalType || 8 == u8nalType)
    {
        pkt.flags = AV_PKT_FLAG_KEY;
    }
}
pkt.data = (uint8_t *)pu8framePtr;
pkt.size = u32LastFrameSize;
/*
 * Convert the sync time (seconds + microseconds) to the stream time base.
 * tv_sec is widened first so the multiplication cannot overflow where
 * tv_sec is only 32 bits wide.
 */
pkt.pts = av_rescale_q((int64_t)s_video_sync.fSyncTime.tv_sec * 1000000 + s_video_sync.fSyncTime.tv_usec, (AVRational){ 1, 1000000 }, video_st->time_base);
/* No B-frames are configured, so decode order equals presentation order. */
pkt.dts = pkt.pts;
pkt.stream_index = video_st->index;
av_interleaved_write_frame(formatContext, &pkt);
av_packet_unref(&pkt);
and for the audio like this:
/*
 * Hand one block of raw PCM A-law audio to the muxer.
 * Reads pu8framePtr, u32AudioDataLen, s_audio_sync, audio_st, formatContext
 * and u8FirstIFrameFound, none of which are declared in this snippet.
 */
AVPacket pkt = { 0 };
av_init_packet(&pkt);
pkt.flags = AV_PKT_FLAG_KEY;
pkt.data = (uint8_t *)pu8framePtr;
pkt.size = u32AudioDataLen;
/*
 * Convert the sync time (seconds + microseconds) to the stream time base.
 * tv_sec is widened first so the multiplication cannot overflow where
 * tv_sec is only 32 bits wide.
 */
pkt.pts = av_rescale_q((int64_t)s_audio_sync.fSyncTime.tv_sec * 1000000 + s_audio_sync.fSyncTime.tv_usec, (AVRational){ 1, 1000000 }, audio_st->time_base);
pkt.dts = pkt.pts;
pkt.stream_index = audio_st->index;
/* Audio is written only once u8FirstIFrameFound is set; earlier packets are dropped. */
if (u8FirstIFrameFound)
{
    av_interleaved_write_frame(formatContext, &pkt);
}
av_packet_unref(&pkt);
and at the end some deinits:
/*
 * Teardown, in this order: write the container trailer, dump the format
 * description for the output file, free both codec contexts, close the
 * output I/O context, and finally free the muxer context itself.
 * NOTE(review): the return value of av_write_trailer() is not checked.
 */
av_write_trailer(formatContext);
av_dump_format(formatContext, 0, pu8outFileName, 1);
avcodec_free_context(&av_video_encode_codec_ctx);
avcodec_free_context(&av_audio_encode_codec_ctx);
/* pb is closed before formatContext is freed, since it is reached through formatContext. */
avio_closep(&formatContext->pb);
avformat_free_context(formatContext);