AnnexB->AVC sample data converter

I discovered that the mp4 files I was writing were viewable in VLC and in
Chrome-on-desktop (ffmpeg-based) but not in Chrome-on-Android
(libstagefright-based). It turns out that I was writing Annex B sample data
rather than the correct AVCParameterSample format. ffmpeg gives both the
"extradata" and the actual frames in Annex B format when reading from rtsp.

This is still my simple, unoptimized implementation of the Annex B parser. My
Raspberry Pi 2 is still able to record my six streams using about 30% of 1
core, so it will do for the moment at least.
This commit is contained in:
Scott Lamb 2016-02-02 20:18:41 -08:00
parent 8ee1ab1c7b
commit ca368944ec
5 changed files with 142 additions and 83 deletions

View File

@@ -46,6 +46,7 @@
#include <string>
#include <glog/logging.h>
#include <re2/stringpiece.h>
extern "C" {
#include <libavformat/avformat.h>
@@ -74,6 +75,11 @@ class VideoPacket {
AVPacket *pkt() { return &pkt_; }
const AVPacket *pkt() const { return &pkt_; }
// Returns a borrowed view of this packet's payload bytes. The view is
// only valid while the underlying AVPacket is alive and unmodified.
re2::StringPiece data() {
  const char *bytes = reinterpret_cast<const char *>(pkt_.data);
  return re2::StringPiece(bytes, pkt_.size);
}
private:
AVPacket pkt_;
};
@@ -99,6 +105,12 @@ class InputVideoPacketStream {
// Returns the video stream.
virtual const AVStream *stream() const = 0;
// Returns a borrowed view of the codec "extradata" ffmpeg associates with
// the video stream (for H.264 from RTSP, an Annex B stream of SPS/PPS).
re2::StringPiece extradata() const {
  const auto *codec = stream()->codec;
  return re2::StringPiece(reinterpret_cast<const char *>(codec->extradata),
                          codec->extradata_size);
}
};
// A class which opens streams.

View File

@@ -82,29 +82,63 @@ TEST(H264Test, DecodeOnly) {
"68 ee 3c 80"));
}
TEST(H264Test, SampleDataFromAnnexBExtraData) {
TEST(H264Test, SampleEntryFromAnnexBExtraData) {
re2::StringPiece test_input(reinterpret_cast<const char *>(kAnnexBTestInput),
sizeof(kAnnexBTestInput));
std::string sample_entry;
std::string error_message;
ASSERT_TRUE(
GetH264SampleEntry(test_input, 1280, 720, &sample_entry, &error_message))
bool need_transform;
ASSERT_TRUE(ParseExtraData(test_input, 1280, 720, &sample_entry,
&need_transform, &error_message))
<< error_message;
EXPECT_EQ(kTestOutput, ToHex(sample_entry, true));
EXPECT_TRUE(need_transform);
}
TEST(H264Test, SampleDataFromAvcDecoderConfigExtraData) {
TEST(H264Test, SampleEntryFromAvcDecoderConfigExtraData) {
re2::StringPiece test_input(
reinterpret_cast<const char *>(kAvcDecoderConfigTestInput),
sizeof(kAvcDecoderConfigTestInput));
std::string sample_entry;
std::string error_message;
ASSERT_TRUE(
GetH264SampleEntry(test_input, 1280, 720, &sample_entry, &error_message))
bool need_transform;
ASSERT_TRUE(ParseExtraData(test_input, 1280, 720, &sample_entry,
&need_transform, &error_message))
<< error_message;
EXPECT_EQ(kTestOutput, ToHex(sample_entry, true));
EXPECT_FALSE(need_transform);
}
// Verifies Annex B -> AVC sample conversion: each 00 00 00 01 start code in
// the input must be replaced by a 4-byte big-endian NAL unit length prefix.
// NOTE(review): the test name says "SampleEntry" but it exercises
// TransformSampleData — consider renaming for clarity.
TEST(H264Test, TransformSampleEntry) {
// Annex B byte stream: SPS (0x67), PPS (0x68), SEI (0x06), then a
// (truncated) IDR slice (0x65), each preceded by a 4-byte start code.
const uint8_t kInput[] = {
0x00, 0x00, 0x00, 0x01, 0x67, 0x4d, 0x00, 0x1f, 0x9a, 0x66,
0x02, 0x80, 0x2d, 0xff, 0x35, 0x01, 0x01, 0x01, 0x40, 0x00,
0x00, 0xfa, 0x00, 0x00, 0x1d, 0x4c, 0x01,
0x00, 0x00, 0x00, 0x01, 0x68, 0xee, 0x3c, 0x80,
0x00, 0x00, 0x00, 0x01, 0x06, 0x06, 0x01, 0xc4, 0x80,
0x00, 0x00, 0x00, 0x01, 0x65, 0x88, 0x80, 0x10, 0x00, 0x08,
0x7f, 0x00, 0x5d, 0x27, 0xb5, 0xc1, 0xff, 0x8c, 0xd6, 0x35,
// (truncated)
};
// The same NAL units, each prefixed with its byte length
// (0x17 = 23, 0x04, 0x05, 0x10 = 16) instead of a start code.
const char kExpectedOutput[] =
"00 00 00 17 "
"67 4d 00 1f 9a 66 02 80 2d ff 35 01 01 01 40 00 00 fa 00 00 1d 4c 01 "
"00 00 00 04 68 ee 3c 80 "
"00 00 00 05 06 06 01 c4 80 "
"00 00 00 10 "
"65 88 80 10 00 08 7f 00 5d 27 b5 c1 ff 8c d6 35";
re2::StringPiece input(reinterpret_cast<const char *>(kInput),
sizeof(kInput));
std::string out;
std::string error_message;
ASSERT_TRUE(TransformSampleData(input, &out, &error_message))
<< error_message;
EXPECT_EQ(kExpectedOutput, ToHex(out, true));
}
} // namespace

View File

@@ -46,15 +46,16 @@ namespace {
const int kNalUnitSeqParameterSet = 7;
const int kNalUnitPicParameterSet = 8;
const uint8_t kNalUnitTypeMask = 0x1F; // bottom 5 bits of first byte of unit.
// Parse sequence parameter set and picture parameter set from ffmpeg's
// "extra_data".
bool ParseAnnexBExtraData(re2::StringPiece extradata, re2::StringPiece *sps,
re2::StringPiece *pps, std::string *error_message) {
bool ok = true;
internal::NalUnitFunction fn = [&ok, sps, pps,
error_message](re2::StringPiece nal_unit) {
internal::NalUnitFunction fn = [&](re2::StringPiece nal_unit) {
// See ISO/IEC 14496-10 section 7.3.1, which defines nal_unit.
uint8_t nal_type = nal_unit[0] & 0x1F; // bottom 5 bits of first byte.
uint8_t nal_type = nal_unit[0] & kNalUnitTypeMask;
switch (nal_type) {
case kNalUnitSeqParameterSet:
*sps = nal_unit;
@@ -85,8 +86,7 @@ bool ParseAnnexBExtraData(re2::StringPiece extradata, re2::StringPiece *sps,
namespace internal {
// See ISO/IEC 14496-10 section B.2: Byte stream NAL unit decoding process.
// This is a relatively simple, unoptimized implementation given that it
// only processes a few dozen bytes per recording.
// This is a relatively simple, unoptimized implementation.
bool DecodeH264AnnexB(re2::StringPiece data, NalUnitFunction process_nal_unit,
std::string *error_message) {
static const RE2 kStartCode("(\\x00{2,}\\x01)");
@@ -125,9 +125,9 @@ bool DecodeH264AnnexB(re2::StringPiece data, NalUnitFunction process_nal_unit,
} // namespace internal
bool GetH264SampleEntry(re2::StringPiece extradata, uint16_t width,
uint16_t height, std::string *out,
std::string *error_message) {
bool ParseExtraData(re2::StringPiece extradata, uint16_t width, uint16_t height,
std::string *sample_entry, bool *need_transform,
std::string *error_message) {
uint32_t avcc_len;
re2::StringPiece sps;
re2::StringPiece pps;
@@ -140,42 +140,44 @@ bool GetH264SampleEntry(re2::StringPiece extradata, uint16_t width,
// This magic value is checked at the end.
avcc_len = 19 + sps.size() + pps.size();
*need_transform = true;
} else {
// Assume "extradata" holds an AVCDecoderConfiguration.
avcc_len = 8 + extradata.size();
*need_transform = false;
}
// This magic value is also checked at the end.
uint32_t avc1_len = 86 + avcc_len;
out->clear();
out->reserve(avc1_len);
sample_entry->clear();
sample_entry->reserve(avc1_len);
// This is a concatenation of the following boxes/classes.
// SampleEntry, ISO/IEC 14496-10 section 8.5.2.
uint32_t avc1_len_pos = out->size();
AppendU32(avc1_len, out); // length
out->append("avc1"); // type
out->append(6, '\x00'); // reserved
AppendU16(1, out); // data_reference_index = 1
uint32_t avc1_len_pos = sample_entry->size();
AppendU32(avc1_len, sample_entry); // length
sample_entry->append("avc1"); // type
sample_entry->append(6, '\x00'); // reserved
AppendU16(1, sample_entry); // data_reference_index = 1
// VisualSampleEntry, ISO/IEC 14496-12 section 12.1.3.
out->append(16, '\x00'); // pre_defined + reserved
AppendU16(width, out);
AppendU16(height, out);
AppendU32(UINT32_C(0x00480000), out); // horizresolution
AppendU32(UINT32_C(0x00480000), out); // vertresolution
AppendU32(0, out); // reserved
AppendU16(1, out); // frame count
out->append(32, '\x00'); // compressorname
AppendU16(0x0018, out); // depth
Append16(-1, out); // pre_defined
sample_entry->append(16, '\x00'); // pre_defined + reserved
AppendU16(width, sample_entry);
AppendU16(height, sample_entry);
AppendU32(UINT32_C(0x00480000), sample_entry); // horizresolution
AppendU32(UINT32_C(0x00480000), sample_entry); // vertresolution
AppendU32(0, sample_entry); // reserved
AppendU16(1, sample_entry); // frame count
sample_entry->append(32, '\x00'); // compressorname
AppendU16(0x0018, sample_entry); // depth
Append16(-1, sample_entry); // pre_defined
// AVCSampleEntry, ISO/IEC 14496-15 section 5.3.4.1.
// AVCConfigurationBox, ISO/IEC 14496-15 section 5.3.4.1.
uint32_t avcc_len_pos = out->size();
AppendU32(avcc_len, out); // length
out->append("avcC"); // type
uint32_t avcc_len_pos = sample_entry->size();
AppendU32(avcc_len, sample_entry); // length
sample_entry->append("avcC"); // type
if (!sps.empty() && !pps.empty()) {
// Create the AVCDecoderConfiguration, ISO/IEC 14496-15 section 5.2.4.1.
@@ -186,43 +188,42 @@ bool GetH264SampleEntry(re2::StringPiece extradata, uint16_t width,
// "emulation_prevention_three_byte" in ISO/IEC 14496-10 section 7.4.
// It looks like 00 is not a valid value of profile_idc, so this distinction
// shouldn't be relevant here. And ffmpeg seems to ignore it.
out->push_back(1); // configurationVersion
out->push_back(sps[1]); // profile_idc -> AVCProfileIndication
out->push_back(sps[2]); // ...misc bits... -> profile_compatibility
out->push_back(sps[3]); // level_idc -> AVCLevelIndication
sample_entry->push_back(1); // configurationVersion
sample_entry->push_back(sps[1]); // profile_idc -> AVCProfileIndication
sample_entry->push_back(
sps[2]); // ...misc bits... -> profile_compatibility
sample_entry->push_back(sps[3]); // level_idc -> AVCLevelIndication
// Hardcode lengthSizeMinusOne to 3. This needs to match what ffmpeg uses
// when generating AVCParameterSamples (ISO/IEC 14496-15 section 5.3.2).
// There doesn't seem to be a clean way to get this from ffmpeg, but it's
// always 3.
out->push_back(static_cast<char>(0xff));
// Hardcode lengthSizeMinusOne to 3, matching TransformSampleData's 4-byte
// lengths.
sample_entry->push_back(static_cast<char>(0xff));
// Only support one SPS and PPS.
// ffmpeg's ff_isom_write_avcc has the same limitation, so it's probably
// fine. This next byte is a reserved 0b111 + a 5-bit # of SPSs (1).
out->push_back(static_cast<char>(0xe1));
AppendU16(sps.size(), out);
out->append(sps.data(), sps.size());
out->push_back(1); // # of PPSs.
AppendU16(pps.size(), out);
out->append(pps.data(), pps.size());
sample_entry->push_back(static_cast<char>(0xe1));
AppendU16(sps.size(), sample_entry);
sample_entry->append(sps.data(), sps.size());
sample_entry->push_back(1); // # of PPSs.
AppendU16(pps.size(), sample_entry);
sample_entry->append(pps.data(), pps.size());
if (out->size() - avcc_len_pos != avcc_len) {
*error_message =
StrCat("internal error: anticipated AVCConfigurationBox length ",
avcc_len, ", but was actually ", out->size() - avcc_len_pos,
"; sps length ", sps.size(), ", pps length ", pps.size());
if (sample_entry->size() - avcc_len_pos != avcc_len) {
*error_message = StrCat(
"internal error: anticipated AVCConfigurationBox length ", avcc_len,
", but was actually ", sample_entry->size() - avcc_len_pos,
"; sps length ", sps.size(), ", pps length ", pps.size());
return false;
}
} else {
out->append(extradata.data(), extradata.size());
sample_entry->append(extradata.data(), extradata.size());
}
if (out->size() - avc1_len_pos != avc1_len) {
if (sample_entry->size() - avc1_len_pos != avc1_len) {
*error_message =
StrCat("internal error: anticipated AVCSampleEntry length ", avc1_len,
", but was actually ", out->size() - avc1_len_pos,
", but was actually ", sample_entry->size() - avc1_len_pos,
"; sps length ", sps.size(), ", pps length ", pps.size());
return false;
}
@@ -230,4 +231,21 @@ bool GetH264SampleEntry(re2::StringPiece extradata, uint16_t width,
return true;
}
// Converts an Annex B sample (start-code-delimited NAL units) into an
// AVCParameterSample (ISO/IEC 14496-15 section 5.3.2), in which every NAL
// unit is instead preceded by a 4-byte big-endian length.
// Returns false and fills |error_message| if the Annex B stream is invalid.
bool TransformSampleData(re2::StringPiece annexb_sample,
                         std::string *avc_sample, std::string *error_message) {
  avc_sample->clear();
  auto append_nal = [avc_sample](re2::StringPiece nal_unit) {
    // The 4-byte length here must stay in sync with the
    // lengthSizeMinusOne == 3 written by ParseExtraData.
    AppendU32(nal_unit.size(), avc_sample);
    avc_sample->append(nal_unit.data(), nal_unit.size());
    return IterationControl::kContinue;
  };
  return internal::DecodeH264AnnexB(annexb_sample, append_nal, error_message);
}
} // namespace moonfire_nvr

View File

@@ -29,20 +29,16 @@
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// h264.h: H.264 decoding. For the most part, Moonfire NVR does not try to
// understand the video codec. There's one exception. It must construct the
// .mp4 sample description table, and for AVC, this includes the ISO/IEC
// 14496-15 section 5.2.4.1 AVCDecoderConfigurationRecord.
// understand the video codec. However, H.264 has two byte stream encodings:
// ISO/IEC 14496-10 Annex B, and ISO/IEC 14496-15 AVC access units.
// When streaming from RTSP, ffmpeg supplies the former. We need the latter
// to stick into .mp4 files. This file manages the conversion, both for
// the ffmpeg "extra data" (which should become the ISO/IEC 14496-15
// section 5.2.4.1 AVCDecoderConfigurationRecord) and the actual samples.
//
// When handling a RTSP input source, ffmpeg supplies as "extradata" an
// ISO/IEC 14496-10 Annex B byte stream containing SPS (sequence parameter
// set) and PPS (picture parameter set) NAL units from which this can be
// constructed. ffmpeg of course also has logic for converting "extradata"
// to the AVCDecoderConfigurationRecord, but unfortunately it is not exposed
// except through ffmpeg's own generated .mp4 file. Extracting just this part
// of their .mp4 files would be more trouble than it's worth.
//
// Just to make things interesting, when handling a .mp4 file, ffmpeg supplies
// as "extradata" an AVCDecoderConfiguration.
// ffmpeg of course has logic to do the same thing, but unfortunately it is
// not exposed except through ffmpeg's own generated .mp4 file. Extracting
// just this part of their .mp4 files would be more trouble than it's worth.
#ifndef MOONFIRE_NVR_H264_H
#define MOONFIRE_NVR_H264_H
@@ -76,9 +72,12 @@ bool DecodeH264AnnexB(re2::StringPiece data, NalUnitFunction process_nal_unit,
// Gets a H.264 sample entry (AVCSampleEntry, which extends
// VisualSampleEntry), given the "extradata", width, and height supplied by
// ffmpeg.
bool GetH264SampleEntry(re2::StringPiece extradata, uint16_t width,
uint16_t height, std::string *out,
std::string *error_message);
bool ParseExtraData(re2::StringPiece extradata, uint16_t width, uint16_t height,
std::string *sample_entry, bool *need_transform,
std::string *error_message);
bool TransformSampleData(re2::StringPiece annexb_sample,
std::string *avc_sample, std::string *error_message);
} // namespace moonfire_nvr

View File

@@ -224,12 +224,14 @@ class IntegrationTest : public testing::Test {
video_sample_entry_.width = in->stream()->codec->width;
video_sample_entry_.height = in->stream()->codec->height;
if (!GetH264SampleEntry(GetExtradata(in.get()), in->stream()->codec->width,
in->stream()->codec->height,
&video_sample_entry_.data, &error_message)) {
bool need_transform;
if (!ParseExtraData(in->extradata(), in->stream()->codec->width,
in->stream()->codec->height, &video_sample_entry_.data,
&need_transform, &error_message)) {
ADD_FAILURE() << "GetH264SampleEntry: " << error_message;
return recording;
}
EXPECT_FALSE(need_transform);
while (true) {
VideoPacket pkt;
@@ -286,7 +288,7 @@ class IntegrationTest : public testing::Test {
StrCat(tmpdir_path_, "/clip.new.mp4"), &error_message);
ASSERT_TRUE(copied != nullptr) << error_message;
EXPECT_EQ(GetExtradata(original.get()), GetExtradata(copied.get()));
EXPECT_EQ(original->extradata(), copied->extradata());
EXPECT_EQ(original->stream()->codec->width, copied->stream()->codec->width);
EXPECT_EQ(original->stream()->codec->height,
copied->stream()->codec->height);
@@ -310,12 +312,6 @@ class IntegrationTest : public testing::Test {
}
}
// Returns a borrowed view of the stream's codec "extradata".
// NOTE(review): duplicates InputVideoPacketStream::extradata(); callers
// could use that accessor directly and this helper could be removed.
re2::StringPiece GetExtradata(InputVideoPacketStream *stream) {
return re2::StringPiece(
reinterpret_cast<const char *>(stream->stream()->codec->extradata),
stream->stream()->codec->extradata_size);
}
re2::StringPiece GetData(const VideoPacket &pkt) {
return re2::StringPiece(reinterpret_cast<const char *>(pkt.pkt()->data),
pkt.pkt()->size);