
I'm trying to resample a decoded audio frame from 48KHz to 44.1KHz using the libswresample API. The code I have is the following:

// 'frame' is the original decoded audio frame
AVFrame *output_frame = av_frame_alloc();

// Without this, there is no sound at all at the output (PTS stuff I guess)
av_frame_copy_props(output_frame, frame);

output_frame->channel_layout = audioStream->codec->channel_layout;
output_frame->sample_rate = audioStream->codec->sample_rate;
output_frame->format = audioStream->codec->sample_fmt;

SwrContext *swr;
// Configure resampling context
swr = swr_alloc_set_opts(NULL,  // we're allocating a new context
                         AV_CH_LAYOUT_STEREO,  // out_ch_layout
                         AV_SAMPLE_FMT_FLTP,     // out_sample_fmt
                         44100,                // out_sample_rate
                         AV_CH_LAYOUT_STEREO,  // in_ch_layout
                         AV_SAMPLE_FMT_FLTP,   // in_sample_fmt
                         48000,                // in_sample_rate
                         0,                    // log_offset
                         NULL);                // log_ctx
// Initialize resampling context

// Perform conversion
swr_convert_frame(swr, output_frame, frame);

// Close resampling context
// Free the original frame and replace it with the new one
return output_frame;

With this code I'm able to hear the audio at the output but it is also noisy. From what I read, this code without the av_frame_copy_props() should be enough but it is not working for some reason. Any ideas?

EDIT: The input stream encodes the audio using AAC and the number of samples is 1024. But, after conversion, the number of samples is 925.

EDIT: I tried doing it in reverse. Since my app receives streams from any source, some audio streams are 48KHz and others are 44.1KHz. So I tried resampling from 44.1KHz to 48KHz to avoid resampling loss. But now each frame has more than 1024 samples and the encoding fails.

EDIT: I tried using libavfilter instead with the following filter chain:

/*
 * Build and configure the audio conversion graph
 *     abuffer -> aformat -> asetnsamples -> abuffersink
 * storing the graph and the filter contexts in the file-level variables
 * filter_graph, abuffer_ctx, aformat_ctx, asetnsamples_ctx and abuffersink_ctx.
 *
 * audio_st: input audio stream; its time base and codec parameters describe
 *           the frames that will be fed into the abuffer source.
 *
 * Returns 0 on success, -1 if the graph cannot be allocated, or the negative
 * error code reported by libavfilter.
 */
int init_filter_graph(AVStream *audio_st) {
    // create new graph
    filter_graph = avfilter_graph_alloc();
    if (!filter_graph) {
        av_log(NULL, AV_LOG_ERROR, "unable to create filter graph: out of memory\n");
        return -1;
    }

    AVFilter *abuffer = avfilter_get_by_name("abuffer");
    AVFilter *aformat = avfilter_get_by_name("aformat");
    AVFilter *asetnsamples = avfilter_get_by_name("asetnsamples");
    AVFilter *abuffersink = avfilter_get_by_name("abuffersink");

    int err;
    // create abuffer filter, described by the parameters of the input stream
    AVCodecContext *avctx = audio_st->codec;
    AVRational time_base = audio_st->time_base;
    snprintf(strbuf, sizeof(strbuf),
             "time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=0x%" PRIx64,
             time_base.num, time_base.den, avctx->sample_rate,
             av_get_sample_fmt_name(avctx->sample_fmt),
             (uint64_t) avctx->channel_layout);
    fprintf(stderr, "abuffer: %s\n", strbuf);
    err = avfilter_graph_create_filter(&abuffer_ctx, abuffer,
                                       NULL, strbuf, NULL, filter_graph);
    if (err < 0) {
        av_log(NULL, AV_LOG_ERROR, "error initializing abuffer filter\n");
        return err;
    }

    // create aformat filter: planar float, 44.1 kHz, stereo
    snprintf(strbuf, sizeof(strbuf),
             "sample_fmts=%s:sample_rates=%d:channel_layouts=0x%" PRIx64,
             av_get_sample_fmt_name(AV_SAMPLE_FMT_FLTP), 44100,
             (uint64_t) AV_CH_LAYOUT_STEREO);
    fprintf(stderr, "aformat: %s\n", strbuf);
    err = avfilter_graph_create_filter(&aformat_ctx, aformat,
                                       NULL, strbuf, NULL, filter_graph);
    if (err < 0) {
        av_log(NULL, AV_LOG_ERROR, "unable to create aformat filter\n");
        return err;
    }

    // create asetnsamples filter
    // NOTE(review): the option string was lost from the original listing;
    // "n=1024:p=0" (1024-sample frames, no padding of the last frame) is a
    // reconstruction -- confirm against the intended encoder frame size.
    snprintf(strbuf, sizeof(strbuf), "n=1024:p=0");
    fprintf(stderr, "asetnsamples: %s\n", strbuf);
    err = avfilter_graph_create_filter(&asetnsamples_ctx, asetnsamples,
                                       NULL, strbuf, NULL, filter_graph);
    if (err < 0) {
        av_log(NULL, AV_LOG_ERROR, "unable to create asetnsamples filter\n");
        return err;
    }

    // create abuffersink filter
    err = avfilter_graph_create_filter(&abuffersink_ctx, abuffersink,
                                       NULL, NULL, NULL, filter_graph);
    if (err < 0) {
        av_log(NULL, AV_LOG_ERROR, "unable to create abuffersink filter\n");
        return err;
    }

    // connect inputs and outputs; the first failing link stops the chain
    if (err >= 0) err = avfilter_link(abuffer_ctx, 0, aformat_ctx, 0);
    if (err >= 0) err = avfilter_link(aformat_ctx, 0, asetnsamples_ctx, 0);
    if (err >= 0) err = avfilter_link(asetnsamples_ctx, 0, abuffersink_ctx, 0);
    if (err < 0) {
        av_log(NULL, AV_LOG_ERROR, "error connecting filters\n");
        return err;
    }

    err = avfilter_graph_config(filter_graph, NULL);
    if (err < 0) {
        av_log(NULL, AV_LOG_ERROR, "error configuring the filter graph\n");
        return err;
    }

    return 0;
}

Now the resulting frame has 1024 samples but the audio is still choppy.


2 Answers


Finally, I got rid of this issue using the solution from here.

This is the filter creation code for my setup (resampling to 44.1KHz):

/* Filter graph shared by the initialization and resampling functions. */
AVFilterGraph *filter_graph = NULL;
/* Source ("abuffer") end of the graph: decoded frames are pushed in here. */
AVFilterContext *buffersrc_ctx = NULL;
/* Sink ("abuffersink") end of the graph: converted frames are pulled from here. */
AVFilterContext *buffersink_ctx = NULL;
/* Textual filter chain parsed into the graph: resample to 44100 Hz, force
 * planar-float stereo, and regroup the samples into 1024-sample frames. */
QString filter_description = "aresample=44100,aformat=sample_fmts=fltp:channel_layouts=stereo,asetnsamples=n=1024:p=0";
* Initialize conversion filter */
int initialize_audio_filter(AVStream *inputStream) {
char args[512];
int ret;
AVFilter *buffersrc = avfilter_get_by_name("abuffer");
AVFilter *buffersink = avfilter_get_by_name("abuffersink");
AVFilterInOut *outputs = avfilter_inout_alloc();
AVFilterInOut *inputs = avfilter_inout_alloc();
filter_graph = avfilter_graph_alloc();
const enum AVSampleFormat out_sample_fmts[] = {AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_NONE};
const int64_t out_channel_layouts[] = {AV_CH_LAYOUT_STEREO, -1};
const int out_sample_rates[] = {44100, -1};

snprintf(args, sizeof(args), "time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=0x%" PRIx64,
         inputStream->codec->time_base.num, inputStream->codec->time_base.den,
ret = avfilter_graph_create_filter(&buffersrc_ctx, buffersrc, "in", args, NULL, filter_graph);

if (ret < 0) {
    svsCritical("", QString("Could not create filter graph, error: %1").arg(svsAvErrorToFormattedString(ret)))
    return -1;

ret = avfilter_graph_create_filter(&buffersink_ctx, buffersink, "out", NULL, NULL, filter_graph);

if (ret < 0) {
    svsCritical("", QString("Cannot create buffer sink, error: %1").arg(svsAvErrorToFormattedString(ret)))
    return ret;

ret = av_opt_set_int_list(buffersink_ctx, "sample_fmts", out_sample_fmts, -1,

if (ret < 0) {
    svsCritical("", QString("Cannot set output sample format, error: %1").arg(svsAvErrorToFormattedString(ret)))
    return ret;

ret = av_opt_set_int_list(buffersink_ctx, "channel_layouts", out_channel_layouts, -1,

if (ret < 0) {
    svsCritical("", QString("Cannot set output channel layout, error: %1").arg(svsAvErrorToFormattedString(ret)))
    return ret;

ret = av_opt_set_int_list(buffersink_ctx, "sample_rates", out_sample_rates, -1,

if (ret < 0) {
    svsCritical("", QString("Cannot set output sample rate, error: %1").arg(svsAvErrorToFormattedString(ret)))
    return ret;

/* Endpoints for the filter graph. */
outputs -> name = av_strdup("in");
outputs -> filter_ctx = buffersrc_ctx;
outputs -> pad_idx = 0;
outputs -> next = NULL;
/* Endpoints for the filter graph. */
inputs -> name = av_strdup("out");
inputs -> filter_ctx = buffersink_ctx;
inputs -> pad_idx = 0;
inputs -> next = NULL;

if ((ret = avfilter_graph_parse_ptr(filter_graph, filter_description.toStdString().c_str(), &inputs, &outputs, NULL)) < 0) {
    svsCritical("", QString("Could not add the filter to graph, error: %1").arg(svsAvErrorToFormattedString(ret)))

if ((ret = avfilter_graph_config(filter_graph, NULL)) < 0) {
    svsCritical("", QString("Could not configure the graph, error: %1").arg(svsAvErrorToFormattedString(ret)))

/* Print summary of the sink buffer
 * Note: args buffer is reused to store channel layout string */
AVFilterLink *outlink = buffersink_ctx->inputs[0];
av_get_channel_layout_string(args, sizeof(args), -1, outlink->channel_layout);
svsInfo("", QString::asprintf("Output: srate:%dHz fmt:%s chlayout:%s\n",
                              (int) outlink->sample_rate,
                              (char *) av_x_if_null(av_get_sample_fmt_name((AVSampleFormat) outlink->format), "?"),
return 0;

And the filter usage:

/**
 * Push one decoded frame through the conversion graph and try to pull one
 * converted frame back out.
 *
 * @param key    identifier passed to the logging macro on failure.
 * @param frame  decoded input frame. It is added with
 *               AV_BUFFERSRC_FLAG_KEEP_REF, so the graph takes its own
 *               reference; the frame is freed here only if it cannot be
 *               queued.
 * @return a newly allocated frame owned by the caller, or nullptr when the
 *         input could not be queued, allocation failed, or the graph has no
 *         complete output frame available yet.
 */
AVFrame* resampleAudio(const QString& key, AVFrame *frame) {

    /* Push the decoded frame into the filtergraph */
    qint32 ret;
    ret = av_buffersrc_add_frame_flags(buffersrc_ctx, frame, AV_BUFFERSRC_FLAG_KEEP_REF);
    if(ret < 0) {
        svsWarning(key, QString("Error adding frame to buffer: %1").arg(svsAvErrorToFormattedString(ret)));
        // Delete input frame and return null
        av_frame_free(&frame);
        return nullptr;
    }

    AVFrame *resampled_frame = av_frame_alloc();
    if(!resampled_frame) {
        return nullptr;
    }

    /* Pull filtered frames from the filtergraph */
    ret = av_buffersink_get_frame(buffersink_ctx, resampled_frame);

    if(ret < 0) {
        // This is very common. For 48KHz -> 44.1KHz, for some input frames the
        // filter does not have enough data to generate another one.
        // Release the empty frame so it is not leaked on every such call.
        av_frame_free(&resampled_frame);
        return nullptr;
    }

    /* Set the timestamp on the resampled frame (only meaningful once a frame
     * has actually been produced; the PTS itself is set by the filter) */
    resampled_frame->best_effort_timestamp = resampled_frame->pts;

    return resampled_frame;
}

It's important to set best_effort_timestamp on the resampled frame to make it work. The PTS of the frame itself is set by the filter.


Don't recreate the SwrContext for each frame. It needs to carry some data over from one frame to the next to smooth out the "edges" between resampled frames.

Create a single SwrContext when you start playing audio and call swr_convert_frame for each frame.