Compute the full AVCSampleEntry (avc1 box).

This code isn't pretty exactly---particularly the hardcoded lengths---but it does work. I'll have a different mechanism for calculating the length and nesting structure forthe more dynamic parts of the moov atom. This way is convenient when generating a single string of mostly static data.
2025-07-28 17:50:59 -04:00 · 2016-01-09 21:13:05 -08:00 · 2016-01-09 21:13:05 -08:00 · 15b1ee54a6
commit 15b1ee54a6
parent 48d0473a4c
4 changed files with 121 additions and 70 deletions
--- a/src/coding.h
+++ b/src/coding.h
@ -139,6 +139,11 @@ inline void AppendU16(uint16_t in, std::string *out) {
  out->append(reinterpret_cast<const char *>(&net), sizeof(uint16_t));
 }

+inline void Append16(int16_t in, std::string *out) {
+  int16_t net = ToNetwork16(in);
+  out->append(reinterpret_cast<const char *>(&net), sizeof(int16_t));
+}
+
 inline void AppendU32(uint32_t in, std::string *out) {
  uint32_t net = ToNetworkU32(in);
  out->append(reinterpret_cast<const char *>(&net), sizeof(uint32_t));
--- a/src/h264-test.cc
+++ b/src/h264-test.cc
@ -66,23 +66,26 @@ TEST(H264Test, DecodeOnly) {
 }

 TEST(H264Test, SampleData) {
-  const uint8_t kTestOutput[] = {0x01, 0x4d, 0x00, 0x1f, 0xff, 0xe1, 0x00, 0x17,
-                                 0x67, 0x4d, 0x00, 0x1f, 0x9a, 0x66, 0x02, 0x80,
-                                 0x2d, 0xff, 0x35, 0x01, 0x01, 0x01, 0x40, 0x00,
-                                 0x00, 0xfa, 0x00, 0x00, 0x1d, 0x4c, 0x01, 0x01,
-                                 0x00, 0x04, 0x68, 0xee, 0x3c, 0x80};
+  const char kTestOutput[] =
+      "00 00 00 84 61 76 63 31 00 00 00 00 00 00 00 01 "
+      "00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 "
+      "05 00 02 d0 00 48 00 00 00 48 00 00 00 00 00 00 "
+      "00 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 "
+      "00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 "
+      "00 00 00 18 ff ff 00 00 00 2e 61 76 63 43 01 4d "
+      "00 1f ff e1 00 17 67 4d 00 1f 9a 66 02 80 2d ff "
+      "35 01 01 01 40 00 00 fa 00 00 1d 4c 01 01 00 04 "
+      "68 ee 3c 80";

  re2::StringPiece test_input(reinterpret_cast<const char *>(kTestInput),
                              sizeof(kTestInput));
-  re2::StringPiece test_output(reinterpret_cast<const char *>(kTestOutput),
-                               sizeof(kTestOutput));
-  std::string avc_decoder_config;
+  std::string sample_entry;
  std::string error_message;
  ASSERT_TRUE(
-      ParseH264ExtraData(test_input, &avc_decoder_config, &error_message))
+      GetH264SampleEntry(test_input, 1280, 720, &sample_entry, &error_message))
      << error_message;

-  EXPECT_EQ(ToHex(test_output), ToHex(avc_decoder_config));
+  EXPECT_EQ(kTestOutput, ToHex(sample_entry));
 }

 }  // namespace
--- a/src/h264.cc
+++ b/src/h264.cc
@ -41,17 +41,47 @@ namespace moonfire_nvr {

 namespace {

-// See ISO/IEC 14496-10 section 7.1.
+// See ISO/IEC 14496-10 table 7-1 - NAL unit type codes, syntax element
+// categories, and NAL unit type classes.
 const int kNalUnitSeqParameterSet = 7;
 const int kNalUnitPicParameterSet = 8;

+// Parse sequence parameter set and picture parameter set from ffmpeg's
+// "extra_data".
+bool ParseExtraData(re2::StringPiece extra_data, std::string *sps,
+                    std::string *pps, std::string *error_message) {
+  bool ok = true;
+  internal::NalUnitFunction fn = [&ok, sps, pps,
+                                  error_message](re2::StringPiece nal_unit) {
+    // See ISO/IEC 14496-10 section 7.3.1, which defines nal_unit.
+    uint8_t nal_type = nal_unit[0] & 0x1F;  // bottom 5 bits of first byte.
+    switch (nal_type) {
+      case kNalUnitSeqParameterSet:
+        *sps = nal_unit.as_string();
+        break;
+      case kNalUnitPicParameterSet:
+        *pps = nal_unit.as_string();
+        break;
+      default:
+        *error_message =
+            StrCat("Expected only SPS and PPS; got type ", nal_type);
+        ok = false;
+        return IterationControl::kBreak;
+    }
+    return IterationControl::kContinue;
+  };
+  if (!internal::DecodeH264AnnexB(extra_data, fn, error_message) || !ok) {
+    return false;
+  }
+  if (sps->empty() || pps->empty()) {
+    *error_message = "SPS and PPS must be specified.";
+    return false;
+  }
+  return true;
+}
+
 }  // namespace

-// See T-REC-H.264-201003-S||PDF-E.PDF page 325 for byte stream NAL unit
-// syntax
-
-// See page 42 for nal_unit.
-
 namespace internal {

 // See ISO/IEC 14496-10 section B.2: Byte stream NAL unit decoding process.
@ -94,47 +124,47 @@ bool DecodeH264AnnexB(re2::StringPiece data, NalUnitFunction process_nal_unit,

 }  // namespace internal

-bool ParseH264ExtraData(re2::StringPiece extra_data,
-                        std::string *avc_decoder_config,
+bool GetH264SampleEntry(re2::StringPiece extra_data, uint16_t width,
+                        uint16_t height, std::string *out,
                        std::string *error_message) {
  std::string sps;
  std::string pps;
-  bool ok = true;
-  internal::NalUnitFunction fn = [&ok, &sps, &pps,
-                                  error_message](re2::StringPiece nal_unit) {
-    uint8_t nal_type = nal_unit[0] & 0x1F;  // bottom 5 bits of first byte.
-    switch (nal_type) {
-      case kNalUnitSeqParameterSet:
-        sps = nal_unit.as_string();
-        break;
-      case kNalUnitPicParameterSet:
-        pps = nal_unit.as_string();
-        break;
-      default:
-        *error_message =
-            StrCat("Expected only SPS and PPS; got type ", nal_type);
-        ok = false;
-        return IterationControl::kBreak;
-    }
-    return IterationControl::kContinue;
-  };
-  if (!internal::DecodeH264AnnexB(extra_data, fn, error_message) || !ok) {
-    return false;
-  }
-  if (sps.empty() || pps.empty()) {
-    *error_message = "SPS and PPS must be specified.";
-    return false;
-  }
-  if (sps.size() < 4) {
-    *error_message = "SPS record is too short.";
-    return false;
-  }
-  if (sps.size() > std::numeric_limits<uint16_t>::max() ||
-      pps.size() > std::numeric_limits<uint16_t>::max()) {
-    *error_message = "SPS or PPS is too long.";
+  if (!ParseExtraData(extra_data, &sps, &pps, error_message)) {
    return false;
  }

+  // These match the size of all fields below.
+  // Don't panic; they're verified at the end.
+  uint32_t avcc_len = 19 + sps.size() + pps.size();
+  uint32_t avc1_len = 86 + avcc_len;
+
+  // This is a concatenation of the following boxes/classes.
+  // SampleEntry, ISO/IEC 14496-10 section 8.5.2.
+  uint32_t avc1_len_pos = out->size();
+  AppendU32(avc1_len, out);  // length
+  out->append("avc1");       // type
+  out->append(6, '\x00');    // reserved
+  AppendU16(1, out);         // data_reference_index = 1
+
+  // VisualSampleEntry, ISO/IEC 14496-12 section 12.1.3.
+  out->append(16, '\x00');  // pre_defined + reserved
+  AppendU16(width, out);
+  AppendU16(height, out);
+  AppendU32(UINT32_C(0x00480000), out);  // horizresolution
+  AppendU32(UINT32_C(0x00480000), out);  // vertresolution
+  AppendU32(0, out);                     // reserved
+  AppendU16(1, out);                     // frame count
+  out->append(32, '\x00');               // compressorname
+  AppendU16(0x0018, out);                // depth
+  Append16(-1, out);                     // pre_defined
+
+  // AVCSampleEntry, ISO/IEC 14496-15 section 5.3.4.1.
+  // AVCConfigurationBox, ISO/IEC 14496-15 section 5.3.4.1.
+  uint32_t avcc_len_pos = out->size();
+  AppendU32(avcc_len, out);  // length
+  out->append("avcC");       // type
+
+  // AVCDecoderConfiguration, ISO/IEC 14496-15 section 5.2.4.1.
  // The beginning of the AVCDecoderConfiguration takes a few values from
  // the SPS (ISO/IEC 14496-10 section 7.3.2.1.1). One caveat: that section
  // defines the syntax in terms of RBSP, not NAL. The difference is the
@ -142,27 +172,41 @@ bool ParseH264ExtraData(re2::StringPiece extra_data,
  // "emulation_prevention_three_byte" in ISO/IEC 14496-10 section 7.4.
  // It looks like 00 is not a valid value of profile_idc, so this distinction
  // shouldn't be relevant here. And ffmpeg seems to ignore it.
-  avc_decoder_config->clear();
-  avc_decoder_config->push_back(1);       // configurationVersion
-  avc_decoder_config->push_back(sps[1]);  // profile_idc -> AVCProfileIndication
-  avc_decoder_config->push_back(sps[2]);  // ... -> profile_compatibility
-  avc_decoder_config->push_back(sps[3]);  // level_idc -> AVCLevelIndication
+  out->push_back(1);       // configurationVersion
+  out->push_back(sps[1]);  // profile_idc -> AVCProfileIndication
+  out->push_back(sps[2]);  // ...misc bits... -> profile_compatibility
+  out->push_back(sps[3]);  // level_idc -> AVCLevelIndication

  // Hardcode lengthSizeMinusOne to 3. This needs to match what ffmpeg uses
  // when generating AVCParameterSamples (ISO/IEC 14496-15 section 5.3.2).
  // There doesn't seem to be a clean way to get this from ffmpeg, but it's
  // always 3.
-  avc_decoder_config->push_back(static_cast<char>(0xff));
+  out->push_back(static_cast<char>(0xff));

  // Only support one SPS and PPS.
  // ffmpeg's ff_isom_write_avcc has the same limitation, so it's probably fine.
  // This next byte is a reserved 0b111 + a 5-bit # of SPSs (1).
-  avc_decoder_config->push_back(static_cast<char>(0xe1));
-  AppendU16(sps.size(), avc_decoder_config);
-  avc_decoder_config->append(sps.data(), sps.size());
-  avc_decoder_config->push_back(1);  // # of PPSs.
-  AppendU16(pps.size(), avc_decoder_config);
-  avc_decoder_config->append(pps.data(), pps.size());
+  out->push_back(static_cast<char>(0xe1));
+  AppendU16(sps.size(), out);
+  out->append(sps.data(), sps.size());
+  out->push_back(1);  // # of PPSs.
+  AppendU16(pps.size(), out);
+  out->append(pps.data(), pps.size());
+
+  if (out->size() - avcc_len_pos != avcc_len) {
+    *error_message =
+        StrCat("internal error: anticipated AVCConfigurationBox length ",
+               avcc_len, ", but was actually ", out->size() - avcc_len_pos,
+               "; sps length ", sps.size(), ", pps length ", pps.size());
+    return false;
+  }
+  if (out->size() - avc1_len_pos != avc1_len) {
+    *error_message =
+        StrCat("internal error: anticipated AVCSampleEntry length ", avc1_len,
+               ", but was actually ", out->size() - avc1_len_pos,
+               "; sps length ", sps.size(), ", pps length ", pps.size());
+    return false;
+  }

  return true;
 }
--- a/src/h264.h
+++ b/src/h264.h
@ -59,7 +59,7 @@ using NalUnitFunction =
    std::function<IterationControl(re2::StringPiece nal_unit)>;

 // Decode a H.264 Annex B byte stream into NAL units.
-// For ParseH264ExtraData; exposed for testing.
+// For GetH264SampleEntry; exposed for testing.
 // Calls |process_nal_unit| for each NAL unit in the byte stream.
 //
 // Note: this won't spot all invalid byte streams. For example, several 0x00s
@ -70,12 +70,11 @@ bool DecodeH264AnnexB(re2::StringPiece data, NalUnitFunction process_nal_unit,

 }  // namespace

-// Parse H.264 "extra data" (as supplied by ffmpeg, an Annex B byte stream
-// containing SPS and PPS NAL units). On success, fills |avc_decoder_config|
-// with an AVCDecoderConfigurationRecord as in ISO/IEC 14496-15 section
-// 5.2.4.1.
-bool ParseH264ExtraData(re2::StringPiece extra_data,
-                        std::string *avc_decoder_config,
+// Gets a H.264 sample entry (AVCSampleEntry, which extends
+// VisualSampleEntry), given the "extra_data", width, and height supplied by
+// ffmpeg.
+bool GetH264SampleEntry(re2::StringPiece extra_data, uint16_t width,
+                        uint16_t height, std::string *out,
                        std::string *error_message);

 }  // namespace moonfire_nvr