Support trimming video segments.

* If the end of a segment is between samples, the last included sample will have a shortened duration. * If the beginning of a segment not on a key frame (aka sync sample), the prefix will be included but trimmed using an edit list. (It seems like a ctts box might be able to accomplish the same thing, fwiw.)
2024-12-26 07:05:56 -05:00 · 2016-05-01 09:16:14 -07:00 · 2016-05-01 09:16:14 -07:00 · 3030e3fb32
commit 3030e3fb32
parent 713d7863de
4 changed files with 153 additions and 36 deletions
--- a/src/mp4-test.cc
+++ b/src/mp4-test.cc
@ -256,11 +256,11 @@ class IntegrationTest : public testing::Test {
  }

  std::shared_ptr<VirtualFile> CreateMp4FromSingleRecording(
-      const Recording &recording, bool include_ts) {
+      const Recording &recording, int32_t rel_start_90k, int32_t rel_end_90k,
+      bool include_ts) {
    Mp4FileBuilder builder(tmpdir_.get());
    builder.SetSampleEntry(video_sample_entry_);
-    builder.Append(Recording(recording), 0,
-                   std::numeric_limits<int32_t>::max());
+    builder.Append(Recording(recording), rel_start_90k, rel_end_90k);
    builder.include_timestamp_subtitle_track(include_ts);
    std::string error_message;
    auto mp4 = builder.Build(&error_message);
@ -280,7 +280,7 @@ class IntegrationTest : public testing::Test {
    WriteFileOrDie(StrCat(tmpdir_path_, "/clip.new.mp4"), &buf);
  }

-  void CompareMp4s() {
+  void CompareMp4s(int64_t pts_offset) {
    std::string error_message;
    auto original = GetRealVideoSource()->OpenFile("../src/testdata/clip.mp4",
                                                   &error_message);
@ -294,22 +294,33 @@ class IntegrationTest : public testing::Test {
    EXPECT_EQ(original->stream()->codec->height,
              copied->stream()->codec->height);

+    int pkt = 0;
    while (true) {
      VideoPacket original_pkt;
      VideoPacket copied_pkt;

      bool original_has_next = original->GetNext(&original_pkt, &error_message);
-      ASSERT_TRUE(original_has_next || error_message.empty()) << error_message;
+      ASSERT_TRUE(original_has_next || error_message.empty())
+          << "pkt " << pkt << ": " << error_message;
      bool copied_has_next = copied->GetNext(&copied_pkt, &error_message);
-      ASSERT_TRUE(copied_has_next || error_message.empty()) << error_message;
+      ASSERT_TRUE(copied_has_next || error_message.empty())
+          << "pkt " << pkt << ": " << error_message;
      if (!original_has_next && !copied_has_next) {
        break;
      }
-      ASSERT_TRUE(original_has_next);
-      ASSERT_TRUE(copied_has_next);
-      EXPECT_EQ(original_pkt.pkt()->pts, copied_pkt.pkt()->pts);
-      EXPECT_EQ(original_pkt.pkt()->duration, copied_pkt.pkt()->duration);
-      EXPECT_EQ(GetData(original_pkt), GetData(copied_pkt));
+      ASSERT_TRUE(original_has_next) << "pkt " << pkt;
+      ASSERT_TRUE(copied_has_next) << "pkt " << pkt;
+      EXPECT_EQ(original_pkt.pkt()->pts + pts_offset, copied_pkt.pkt()->pts)
+          << "pkt " << pkt;
+
+      // One would normally expect the duration to be exactly the same, but
+      // when using an edit list, ffmpeg appears to extend the last packet's
+      // duration by the amount skipped at the beginning. I think this is a
+      // bug on their side.
+      EXPECT_LE(original_pkt.pkt()->duration, copied_pkt.pkt()->duration)
+          << "pkt " << pkt;
+      EXPECT_EQ(GetData(original_pkt), GetData(copied_pkt)) << "pkt " << pkt;
+      ++pkt;
    }
  }

@ -329,9 +340,10 @@ TEST_F(IntegrationTest, RoundTrip) {
  if (HasFailure()) {
    return;
  }
-  auto f = CreateMp4FromSingleRecording(recording, false);
+  auto f = CreateMp4FromSingleRecording(
+      recording, 0, std::numeric_limits<int32_t>::max(), false);
  WriteMp4(f.get());
-  CompareMp4s();
+  CompareMp4s(0);
 }

 TEST_F(IntegrationTest, RoundTripWithSubtitle) {
@ -339,9 +351,21 @@ TEST_F(IntegrationTest, RoundTripWithSubtitle) {
  if (HasFailure()) {
    return;
  }
-  auto f = CreateMp4FromSingleRecording(recording, true);
+  auto f = CreateMp4FromSingleRecording(
+      recording, 0, std::numeric_limits<int32_t>::max(), true);
  WriteMp4(f.get());
-  CompareMp4s();
+  CompareMp4s(0);
+}
+
+TEST_F(IntegrationTest, RoundTripWithEditList) {
+  Recording recording = CopyMp4ToSingleRecording();
+  if (HasFailure()) {
+    return;
+  }
+  auto f = CreateMp4FromSingleRecording(
+      recording, 1, std::numeric_limits<int32_t>::max(), false);
+  WriteMp4(f.get());
+  CompareMp4s(-1);
 }

 TEST_F(IntegrationTest, Metadata) {
@ -349,14 +373,15 @@ TEST_F(IntegrationTest, Metadata) {
  if (HasFailure()) {
    return;
  }
-  auto f = CreateMp4FromSingleRecording(recording, false);
+  auto f = CreateMp4FromSingleRecording(
+      recording, 0, std::numeric_limits<int32_t>::max(), false);

  // This test is brittle, which is the point. Any time the digest comparison
  // here fails, it can be updated, but the etag must change as well!
  // Otherwise clients may combine ranges from the new format with ranges
  // from the old format!
  EXPECT_EQ("1e5331e8371bd97ac3158b3a86494abc87cdc70e", Digest(f.get()));
-  EXPECT_EQ("\"62f5e00a6e1e6dd893add217b1bf7ed7446b8b9d\"", f->etag());
+  EXPECT_EQ("\"a9ce99a83a177516e7f862fed405e2b47b7d4ae3\"", f->etag());

  // 10 seconds later than the segment's start time.
  EXPECT_EQ(1430006410, f->last_modified());
--- a/src/mp4.cc
+++ b/src/mp4.cc
@ -106,7 +106,7 @@ const size_t kSubtitleLength = strlen("2015-07-02 17:10:00 -0700");
 // that causes the different bytes to be output for a particular set of
 // Mp4Builder options. Incrementing this value will cause the etag to change
 // as well.
-const char kFormatVersion[] = {0x00};
+const char kFormatVersion[] = {0x01};

 // ISO/IEC 14496-12 section 4.3, ftyp.
 const char kFtypBox[] = {
@ -293,6 +293,18 @@ struct TrackHeaderBoxVersion0 {  // ISO/IEC 14496-12 section 8.3.2, tkhd.
  uint32_t height = NET_UINT32_C(0);
 } __attribute__((packed));

+struct EditBox {  // ISO/IEC 14496-12 section 8.6.5, edts.
+  uint32_t size = NET_UINT32_C(0);
+  const char type[4] = {'e', 'd', 't', 's'};
+} __attribute__((packed));
+
+struct EditListBoxVersion0 {  // ISO/IEC 14496-12 section 8.6.6, elst.
+  uint32_t size = NET_UINT32_C(0);
+  const char type[4] = {'e', 'l', 's', 't'};
+  const uint32_t version_and_flags = NET_UINT32_C(0);
+  uint32_t entry_count = NET_UINT32_C(0);
+};
+
 struct MediaBox {  // ISO/IEC 14496-12 section 8.4.1, mdia.
  uint32_t size = NET_UINT32_C(0);
  const char type[4] = {'m', 'd', 'i', 'a'};
@ -425,6 +437,8 @@ class ScopedMp4Box {
 //
 // ** trak (video: container for an individual track or stream)
 // *** tkhd (track header, overall information about the track)
+// *** (optional) edts (edit list container)
+// **** elst (an edit list)
 // *** mdia (container for the media information in a track)
 // **** mdhd (media header, overall information about the media)
 // *** minf (media information container)
@ -439,7 +453,7 @@ class ScopedMp4Box {
 // ***** co64 (64-bit chunk offset)
 // ***** stss (sync sample table)
 //
-// ** trak (subtitle: container for an individual track or stream)
+// ** (optional) trak (subtitle: container for an individual track or stream)
 // *** tkhd (track header, overall information about the track)
 // *** mdia (container for the media information in a track)
 // **** mdhd (media header, overall information about the media)
@ -480,9 +494,9 @@ class Mp4File : public VirtualFile {
    uint32_t duration = 0;
    int64_t max_time_90k = 0;
    for (const auto &segment : segments_) {
-      duration += segment->pieces.duration_90k();
+      duration += segment->pieces.end_90k() - segment->rel_start_90k;
      int64_t start_90k =
-          segment->recording.start_time_90k + segment->pieces.start_90k();
+          segment->recording.start_time_90k + segment->rel_start_90k;
      int64_t end_90k =
          segment->recording.start_time_90k + segment->pieces.end_90k();
      int64_t start_ts = start_90k / kTimeUnitsPerSecond;
@ -568,6 +582,7 @@ class Mp4File : public VirtualFile {
        moov_video_trak_tkhd_.header().height =
            NET_UINT32_C(video_sample_entry_.height << 16);
      }
+      MaybeAppendVideoEdts();
      {
        CONSTRUCT_BOX(moov_video_trak_mdia_);
        {
@ -590,6 +605,53 @@ class Mp4File : public VirtualFile {
    }
  }

+  void MaybeAppendVideoEdts() {
+    struct Entry {
+      Entry(int32_t segment_duration, int32_t media_time)
+          : segment_duration(segment_duration), media_time(media_time) {}
+      int32_t segment_duration = 0;
+      int32_t media_time = 0;
+      int32_t end() const { return segment_duration + media_time; }
+    };
+    std::vector<Entry> entries;
+    int64_t cur_media_time = 0;
+    for (const auto &segment : segments_) {
+      auto skip = segment->rel_start_90k - segment->pieces.start_90k();
+      auto keep = segment->pieces.end_90k() - segment->rel_start_90k;
+      DCHECK_GE(skip, 0);
+      DCHECK_GT(keep, 0);
+      cur_media_time += skip;
+      if (!entries.empty() && entries.back().end() == cur_media_time) {
+        entries.back().segment_duration += keep;
+      } else {
+        entries.emplace_back(keep, cur_media_time);
+      }
+      DCHECK_GT(segment->pieces.duration_90k(), 0);
+      cur_media_time += keep;
+    }
+    if (entries.size() == 1 && entries[0].media_time == 0) {
+      return;  // use implicit one-to-one mapping.
+    }
+
+    VLOG(1) << "Using edit list with " << entries.size() << " entries.";
+    std::string *s = &moov_video_trak_edts_elst_entries_str_;
+    for (const auto &entry : entries) {
+      VLOG(2) << "...duration=" << entry.segment_duration
+              << ", time=" << entry.media_time;
+      AppendU32(entry.segment_duration, s);
+      AppendU32(entry.media_time, s);
+      AppendU16(1, s);  // media_rate_integer
+      AppendU16(1, s);  // media_rate_fraction
+    }
+    CONSTRUCT_BOX(moov_video_trak_edts_);
+    CONSTRUCT_BOX(moov_video_trak_edts_elst_);
+    moov_video_trak_edts_elst_.header().entry_count =
+        ToNetworkU32(entries.size());
+    moov_video_trak_edts_elst_entries_.Init(
+        moov_video_trak_edts_elst_entries_str_);
+    slices_.Append(&moov_video_trak_edts_elst_entries_);
+  }
+
  void AppendVideoStbl() {
    CONSTRUCT_BOX(moov_video_trak_mdia_minf_stbl_);
    {
@ -763,7 +825,7 @@ class Mp4File : public VirtualFile {
    std::string &s = moov_subtitle_trak_mdia_minf_stbl_stts_entries_str_;
    for (const auto &segment : segments_) {
      int64_t start_90k =
-          segment->recording.start_time_90k + segment->pieces.start_90k();
+          segment->recording.start_time_90k + segment->rel_start_90k;
      int64_t end_90k =
          segment->recording.start_time_90k + segment->pieces.end_90k();
      int64_t start_next_90k =
@ -802,7 +864,7 @@ class Mp4File : public VirtualFile {
    memset(&mytm, 0, sizeof(mytm));
    for (const auto &segment : segments_) {
      int64_t start_90k =
-          segment->recording.start_time_90k + segment->pieces.start_90k();
+          segment->recording.start_time_90k + segment->rel_start_90k;
      int64_t end_90k =
          segment->recording.start_time_90k + segment->pieces.end_90k();
      int64_t start_ts = start_90k / kTimeUnitsPerSecond;
@ -839,6 +901,10 @@ class Mp4File : public VirtualFile {

  Mp4Box<TrackBox> moov_video_trak_;
  Mp4Box<TrackHeaderBoxVersion0> moov_video_trak_tkhd_;
+  Mp4Box<EditBox> moov_video_trak_edts_;
+  Mp4Box<EditListBoxVersion0> moov_video_trak_edts_elst_;
+  StringPieceSlice moov_video_trak_edts_elst_entries_;
+  std::string moov_video_trak_edts_elst_entries_str_;
  Mp4Box<MediaBox> moov_video_trak_mdia_;
  Mp4Box<MediaHeaderBoxVersion0> moov_video_trak_mdia_mdhd_;
  StringPieceSlice moov_video_trak_mdia_hdlr_;
@ -906,6 +972,7 @@ bool Mp4SampleTablePieces::Init(const Recording *recording,
    key_frames_ = recording->video_sync_samples;
    actual_end_90k_ = recording_duration_90k;
  } else {
+    VLOG(1) << "Slow path.";
    if (!it.done() && !it.is_key()) {
      *error_message = "First frame must be a key frame.";
      return false;
@ -941,6 +1008,7 @@ bool Mp4SampleTablePieces::Init(const Recording *recording,
    *error_message = it.error();
    return false;
  }
+  actual_end_90k_ = std::min(actual_end_90k_, desired_end_90k_);
  VLOG(1) << "requested ts [" << start_90k << ", " << end_90k << "), got ts ["
          << begin_.start_90k() << ", " << actual_end_90k_ << "), " << frames_
          << " frames (" << key_frames_
@ -967,7 +1035,17 @@ bool Mp4SampleTablePieces::FillSttsEntries(std::string *s,
  for (it = begin_; !it.done() && it.start_90k() < desired_end_90k_;
       it.Next()) {
    AppendU32(1, s);
-    AppendU32(it.duration_90k(), s);
+
+    // The final sample may be shortened to the desired end.
+    if (it.end_90k() > desired_end_90k_) {
+      auto new_duration = desired_end_90k_ - it.start_90k();
+      VLOG(1) << "Shortening final sample duration from " << it.duration_90k()
+              << " to " << new_duration;
+      AppendU32(new_duration, s);
+      break;
+    } else {
+      AppendU32(it.duration_90k(), s);
+    }
  }
  if (it.has_error()) {
    *error_message = it.error();
--- a/src/mp4.h
+++ b/src/mp4.h
@ -66,10 +66,10 @@ class Mp4SampleTablePieces {
  // samples() values.
  //
  // |start_90k| and |end_90k| should be relative to the start of the recording.
-  // They indicate the *desired* time range. The *actual* time range will be
-  // from the last sync sample <= |start_90k| to the last sample with start time
-  // <= |end_90k|. TODO: support edit lists and duration trimming to produce
-  // the exact correct time range.
+  // They indicate the *desired* time range. The *actual* time range will
+  // start at the last sync sample <= |start_90k|. (The caller is responsible
+  // for creating an edit list to skip the undesired portion.) It will end at
+  // the desired range, or the end of the recording, whichever is sooner.
  bool Init(const Recording *recording, int sample_entry_index,
            int32_t sample_offset, int32_t start_90k, int32_t end_90k,
            std::string *error_message);
@ -88,8 +88,8 @@ class Mp4SampleTablePieces {
  // Return the byte range in the sample file of the frames represented here.
  ByteRange sample_pos() const { return sample_pos_; }

+  // As described above, these may differ from the desired range.
  uint64_t duration_90k() const { return actual_end_90k_ - begin_.start_90k(); }
-
  int32_t start_90k() const { return begin_.start_90k(); }
  int32_t end_90k() const { return actual_end_90k_; }

@ -121,7 +121,15 @@ struct Mp4FileSegment {
  Recording recording;
  Mp4SampleTablePieces pieces;
  RealFileSlice sample_file_slice;
+
+  // Requested start time, relative to recording.start_90k.
+  // If there is no key frame at exactly this position, |pieces| will actually
+  // start sooner, and an edit list should be used to skip the undesired
+  // prefix.
  int32_t rel_start_90k = 0;
+
+  // Requested end time, relative to recording.end_90k.
+  // This will be clamped to the actual duration of the recording.
  int32_t rel_end_90k = std::numeric_limits<int32_t>::max();
 };

--- a/src/web.cc
+++ b/src/web.cc
@ -375,14 +375,15 @@ std::shared_ptr<VirtualFile> WebInterface::BuildMp4(
  bool ok = true;
  auto row_cb = [&](Recording &recording,
                    const VideoSampleEntry &sample_entry) {
-    if (rows == 0 && recording.start_time_90k != next_row_start_time_90k) {
+    if (rows == 0 && recording.start_time_90k > next_row_start_time_90k) {
      *error_message = StrCat(
          "recording starts late: ", PrettyTimestamp(recording.start_time_90k),
          " (", recording.start_time_90k, ") rather than requested: ",
          PrettyTimestamp(start_time_90k), " (", start_time_90k, ")");
      ok = false;
      return IterationControl::kBreak;
-    } else if (recording.start_time_90k != next_row_start_time_90k) {
+    } else if (rows > 0 &&
+               recording.start_time_90k != next_row_start_time_90k) {
      *error_message = StrCat("gap/overlap in recording: ",
                              PrettyTimestamp(next_row_start_time_90k), " (",
                              next_row_start_time_90k, ") to: ",
@ -405,10 +406,15 @@ std::shared_ptr<VirtualFile> WebInterface::BuildMp4(
      builder.SetSampleEntry(sample_entry);
    }

-    // TODO: correct bounds within recording.
-    // Currently this can return too much data.
-    builder.Append(std::move(recording), 0,
-                   std::numeric_limits<int32_t>::max());
+    int32_t rel_start_90k = 0;
+    int32_t rel_end_90k = std::numeric_limits<int32_t>::max();
+    if (recording.start_time_90k < start_time_90k) {
+      rel_start_90k = start_time_90k - recording.start_time_90k;
+    }
+    if (recording.end_time_90k > end_time_90k) {
+      rel_end_90k = end_time_90k - recording.start_time_90k;
+    }
+    builder.Append(std::move(recording), rel_start_90k, rel_end_90k);
    ++rows;
    return IterationControl::kContinue;
  };
@ -421,7 +427,7 @@ std::shared_ptr<VirtualFile> WebInterface::BuildMp4(
    *error_message = StrCat("no recordings in range");
    return false;
  }
-  if (next_row_start_time_90k != end_time_90k) {
+  if (next_row_start_time_90k < end_time_90k) {
    *error_message = StrCat("recording ends early: ",
                            PrettyTimestamp(next_row_start_time_90k), " (",
                            next_row_start_time_90k, "), not requested: ",