RTP depacketization

Question

I'm trying to parse RTP AVC video stream to prepare it for H264 Decoder.

This is my packet stream captured by Wireshark First I'm trying to find IDR slice, SPS and PPS parameters, so this is it https://dl.dropboxusercontent.com/u/76817805/frame.pcapng

Next I'm doing following:

1) Find PPS nad SPS data and copying them to NAL units into depacketization buffer with [0x00 0x00 0x01] start sequense.

[00 00 01 | SPS][00 00 01 | PPS]

2) for packets started with [0x7C 0x85] (Start bit = 1)I'm reconstructing first NAL heder (0x65 for my case) and copying data folowed by 0x7C 0x85 into depacketization buffer with start sequense.

[00 00 01 65 | video data ......]

3) for packets started with [0x7C 0x05] I'm copying data except 2 first bytes into depacketization buffer.

[..... video data .....]

4) for packets started with [0x7C 0x45] (Stop bit = 1) I'm copying data except 2 first bytes into depacketization buffer. [..... video data (last byte)]

5) For not fragmented packets I'm just copying data into depacketization buffer wiyh start sequense.

[00 00 01 | NALu]

So in the end of parsing example video stream I've got this binary file https://dl.dropboxusercontent.com/u/76817805/raw.264, but it can't be decoded correctly. ![enter image description here][1]

Can anybody help me please and find mistake in my algorithm? What am I doing wrong? Thanks a lot for everybody.

UInt32 parseRTP( Uint8 * buf, int inputDataLen, Uint32 curAdr)
{
int result_len = 0;

// filter zero bytes at the end of packet

for (i = inputDataLen-1; i>0; i--)
{
    if (buf[i] == 0x00) inputDataLen--;
    else break;
}

// get NAL type
nal = buf[0];
type = (nal & 0x1f);

if ((buf[0] == 0x7C) && (buf[1] == 0x85)) IFrameisOK = 1; // Start of I frame

if (type == 6)
    return 0;

if (type == 7) // new SPS
{
    memcpy((void*)sps, start_sequence, sizeof(start_sequence));
    memcpy((void*)(sps + sizeof(start_sequence)), buf, inputDataLen);
    sps_len = inputDataLen + sizeof(start_sequence);
    SPSisOK = 1;
    return 0;
}

if (type == 8) // new PPS
{
    memcpy((void*)pps, start_sequence, sizeof(start_sequence));
    memcpy((void*)(pps + sizeof(start_sequence)), buf, inputDataLen);
    pps_len = inputDataLen + sizeof(start_sequence);
    PPSisOK = 1;
    return 0;
}



if (SPSisOK == 1 && PPSisOK == 1)
{
    if (IFrameisOK == 0) return 0; // wait I-frame

    /*  Simplify the case.
        These are all the nal types used internally by the h264 codec
     */
    if (type >= 1 && type <= 23) type = 1;
    switch (type)
        {
            case 0: // undefined;
                break;

            case 1:
                // copy start sequence

                memcpy((void*)curAdr, start_sequence, sizeof(start_sequence));
                curAdr += sizeof(start_sequence);

                // copy data
                memcpy((void*)curAdr, buf, inputDataLen);
                curAdr += inputDataLen;

                result_len = sizeof(start_sequence) +  inputDataLen;
                break;

            case 24: // STAP-A (one packet, multiple nals)  not used in this project
                break;
            case 25: // STAP-B
            case 26: // MTAP-16
            case 27: // MTAP-24
            case 29: // FU-B
                //not used in this project
                break;
            case 28: // FU-A (fragmented nal)

                inputDataLen -= 2; // delete 2 first bytes for fragmented units
                //skip the fu_indicator
                buf++;

                Uint8 fu_indicator = nal;
                Uint8 fu_header = *buf; // read the fu_header.
                Uint8 start_bit = fu_header >> 7;
                Uint8 reconstructed_nal;
                Uint8 nal_type = (fu_header & 0x1f);


                /* reconstruct this packet's true nal; only the
                data follows..*/
                reconstructed_nal = fu_indicator & (0xe0);

                /*the original nal forbidden bit and NRI are stored in this
                packet's nal*/
                reconstructed_nal |= nal_type;

                // skip the fu_header...
                buf++;

                if(start_bit)
                {
                    if (NEED_CONFIGS)
                    {
                        // copy SPS and PPS first
                        memcpy((void*)curAdr, sps, sps_len);
                        curAdr += sps_len;

                        memcpy((void*)curAdr, pps, pps_len);
                        curAdr += pps_len;

                    }

                    // copy in the start sequence
                    memcpy((void*)curAdr, start_sequence, sizeof(start_sequence));
                    curAdr += sizeof(start_sequence);


                    // copy reconstructed nal
                    memcpy((void*)curAdr,&reconstructed_nal, sizeof(reconstructed_nal));
                    curAdr += sizeof(reconstructed_nal);


                    // copy payload
                    memcpy((void*)curAdr,buf, inputDataLen);
                    curAdr += inputDataLen;

                    if (NEED_CONFIGS)
                    {
                        result_len = (sps_len + pps_len + sizeof(start_sequence) + sizeof(reconstructed_nal) + inputDataLen);
                        NEED_CONFIGS = 0;
                    }
                    else
                    {
                        result_len += (sizeof(start_sequence) + sizeof(reconstructed_nal) + inputDataLen);
                    }

                }
                else
                {

                    memcpy((void*)curAdr,buf, inputDataLen);
                    curAdr += inputDataLen;

                    result_len = inputDataLen;
                }


                break;
            default:
                break;
        }
    return result_len;
}
else
{
    return 0;
}

}

Roman R. Roman R. · Accepted Answer · 2014-08-18T11:38:40

Depacketization rules are described in RFC 6184 - RTP Payload Format for H.264 Video and you should follow them rather then trying to invent your own.

Your assumption that prepending a fragment with 00 00 01 65 is reconstructing a NAL unit is incorrect.

The idea is that this NAL unit is too large to fit single packet, and then it's fragmented into parts. You are to receive several RTP fragments and then combine them into single NAL unit reconstructing it in full, and in its original state. See section 5.8. Fragmentation Units (FUs) for details.

Instead of following the mentioned above, you are simply adding 00 00 01 65 to each part of the NAL unit - this is expectedly not going to produce decodable output.

RTP depacketization

1 Answers