From 3030e3fb321ae0a6cbc4aaa4b4b542cc475da1a6 Mon Sep 17 00:00:00 2001 From: Scott Lamb Date: Sun, 1 May 2016 09:16:14 -0700 Subject: [PATCH] Support trimming video segments. * If the end of a segment is between samples, the last included sample will have a shortened duration. * If the beginning of a segment not on a key frame (aka sync sample), the prefix will be included but trimmed using an edit list. (It seems like a ctts box might be able to accomplish the same thing, fwiw.) --- src/mp4-test.cc | 59 ++++++++++++++++++++++--------- src/mp4.cc | 92 +++++++++++++++++++++++++++++++++++++++++++++---- src/mp4.h | 18 +++++++--- src/web.cc | 20 +++++++---- 4 files changed, 153 insertions(+), 36 deletions(-) diff --git a/src/mp4-test.cc b/src/mp4-test.cc index 7f10d60..0b5612a 100644 --- a/src/mp4-test.cc +++ b/src/mp4-test.cc @@ -256,11 +256,11 @@ class IntegrationTest : public testing::Test { } std::shared_ptr CreateMp4FromSingleRecording( - const Recording &recording, bool include_ts) { + const Recording &recording, int32_t rel_start_90k, int32_t rel_end_90k, + bool include_ts) { Mp4FileBuilder builder(tmpdir_.get()); builder.SetSampleEntry(video_sample_entry_); - builder.Append(Recording(recording), 0, - std::numeric_limits::max()); + builder.Append(Recording(recording), rel_start_90k, rel_end_90k); builder.include_timestamp_subtitle_track(include_ts); std::string error_message; auto mp4 = builder.Build(&error_message); @@ -280,7 +280,7 @@ class IntegrationTest : public testing::Test { WriteFileOrDie(StrCat(tmpdir_path_, "/clip.new.mp4"), &buf); } - void CompareMp4s() { + void CompareMp4s(int64_t pts_offset) { std::string error_message; auto original = GetRealVideoSource()->OpenFile("../src/testdata/clip.mp4", &error_message); @@ -294,22 +294,33 @@ class IntegrationTest : public testing::Test { EXPECT_EQ(original->stream()->codec->height, copied->stream()->codec->height); + int pkt = 0; while (true) { VideoPacket original_pkt; VideoPacket copied_pkt; bool original_has_next = original->GetNext(&original_pkt, &error_message); - ASSERT_TRUE(original_has_next || error_message.empty()) << error_message; + ASSERT_TRUE(original_has_next || error_message.empty()) + << "pkt " << pkt << ": " << error_message; bool copied_has_next = copied->GetNext(&copied_pkt, &error_message); - ASSERT_TRUE(copied_has_next || error_message.empty()) << error_message; + ASSERT_TRUE(copied_has_next || error_message.empty()) + << "pkt " << pkt << ": " << error_message; if (!original_has_next && !copied_has_next) { break; } - ASSERT_TRUE(original_has_next); - ASSERT_TRUE(copied_has_next); - EXPECT_EQ(original_pkt.pkt()->pts, copied_pkt.pkt()->pts); - EXPECT_EQ(original_pkt.pkt()->duration, copied_pkt.pkt()->duration); - EXPECT_EQ(GetData(original_pkt), GetData(copied_pkt)); + ASSERT_TRUE(original_has_next) << "pkt " << pkt; + ASSERT_TRUE(copied_has_next) << "pkt " << pkt; + EXPECT_EQ(original_pkt.pkt()->pts + pts_offset, copied_pkt.pkt()->pts) + << "pkt " << pkt; + + // One would normally expect the duration to be exactly the same, but + // when using an edit list, ffmpeg appears to extend the last packet's + // duration by the amount skipped at the beginning. I think this is a + // bug on their side. + EXPECT_LE(original_pkt.pkt()->duration, copied_pkt.pkt()->duration) + << "pkt " << pkt; + EXPECT_EQ(GetData(original_pkt), GetData(copied_pkt)) << "pkt " << pkt; + ++pkt; } } @@ -329,9 +340,10 @@ TEST_F(IntegrationTest, RoundTrip) { if (HasFailure()) { return; } - auto f = CreateMp4FromSingleRecording(recording, false); + auto f = CreateMp4FromSingleRecording( + recording, 0, std::numeric_limits::max(), false); WriteMp4(f.get()); - CompareMp4s(); + CompareMp4s(0); } TEST_F(IntegrationTest, RoundTripWithSubtitle) { @@ -339,9 +351,21 @@ TEST_F(IntegrationTest, RoundTripWithSubtitle) { if (HasFailure()) { return; } - auto f = CreateMp4FromSingleRecording(recording, true); + auto f = CreateMp4FromSingleRecording( + recording, 0, std::numeric_limits::max(), true); WriteMp4(f.get()); - CompareMp4s(); + CompareMp4s(0); +} + +TEST_F(IntegrationTest, RoundTripWithEditList) { + Recording recording = CopyMp4ToSingleRecording(); + if (HasFailure()) { + return; + } + auto f = CreateMp4FromSingleRecording( + recording, 1, std::numeric_limits::max(), false); + WriteMp4(f.get()); + CompareMp4s(-1); } TEST_F(IntegrationTest, Metadata) { @@ -349,14 +373,15 @@ TEST_F(IntegrationTest, Metadata) { if (HasFailure()) { return; } - auto f = CreateMp4FromSingleRecording(recording, false); + auto f = CreateMp4FromSingleRecording( + recording, 0, std::numeric_limits::max(), false); // This test is brittle, which is the point. Any time the digest comparison // here fails, it can be updated, but the etag must change as well! // Otherwise clients may combine ranges from the new format with ranges // from the old format! EXPECT_EQ("1e5331e8371bd97ac3158b3a86494abc87cdc70e", Digest(f.get())); - EXPECT_EQ("\"62f5e00a6e1e6dd893add217b1bf7ed7446b8b9d\"", f->etag()); + EXPECT_EQ("\"a9ce99a83a177516e7f862fed405e2b47b7d4ae3\"", f->etag()); // 10 seconds later than the segment's start time. EXPECT_EQ(1430006410, f->last_modified()); diff --git a/src/mp4.cc b/src/mp4.cc index 40ce2ee..c70f360 100644 --- a/src/mp4.cc +++ b/src/mp4.cc @@ -106,7 +106,7 @@ const size_t kSubtitleLength = strlen("2015-07-02 17:10:00 -0700"); // that causes the different bytes to be output for a particular set of // Mp4Builder options. Incrementing this value will cause the etag to change // as well. -const char kFormatVersion[] = {0x00}; +const char kFormatVersion[] = {0x01}; // ISO/IEC 14496-12 section 4.3, ftyp. const char kFtypBox[] = { @@ -293,6 +293,18 @@ struct TrackHeaderBoxVersion0 { // ISO/IEC 14496-12 section 8.3.2, tkhd. uint32_t height = NET_UINT32_C(0); } __attribute__((packed)); +struct EditBox { // ISO/IEC 14496-12 section 8.6.5, edts. + uint32_t size = NET_UINT32_C(0); + const char type[4] = {'e', 'd', 't', 's'}; +} __attribute__((packed)); + +struct EditListBoxVersion0 { // ISO/IEC 14496-12 section 8.6.6, elst. + uint32_t size = NET_UINT32_C(0); + const char type[4] = {'e', 'l', 's', 't'}; + const uint32_t version_and_flags = NET_UINT32_C(0); + uint32_t entry_count = NET_UINT32_C(0); +}; + struct MediaBox { // ISO/IEC 14496-12 section 8.4.1, mdia. uint32_t size = NET_UINT32_C(0); const char type[4] = {'m', 'd', 'i', 'a'}; @@ -425,6 +437,8 @@ class ScopedMp4Box { // // ** trak (video: container for an individual track or stream) // *** tkhd (track header, overall information about the track) +// *** (optional) edts (edit list container) +// **** elst (an edit list) // *** mdia (container for the media information in a track) // **** mdhd (media header, overall information about the media) // *** minf (media information container) @@ -439,7 +453,7 @@ class ScopedMp4Box { // ***** co64 (64-bit chunk offset) // ***** stss (sync sample table) // -// ** trak (subtitle: container for an individual track or stream) +// ** (optional) trak (subtitle: container for an individual track or stream) // *** tkhd (track header, overall information about the track) // *** mdia (container for the media information in a track) // **** mdhd (media header, overall information about the media) @@ -480,9 +494,9 @@ class Mp4File : public VirtualFile { uint32_t duration = 0; int64_t max_time_90k = 0; for (const auto &segment : segments_) { - duration += segment->pieces.duration_90k(); + duration += segment->pieces.end_90k() - segment->rel_start_90k; int64_t start_90k = - segment->recording.start_time_90k + segment->pieces.start_90k(); + segment->recording.start_time_90k + segment->rel_start_90k; int64_t end_90k = segment->recording.start_time_90k + segment->pieces.end_90k(); int64_t start_ts = start_90k / kTimeUnitsPerSecond; @@ -568,6 +582,7 @@ class Mp4File : public VirtualFile { moov_video_trak_tkhd_.header().height = NET_UINT32_C(video_sample_entry_.height << 16); } + MaybeAppendVideoEdts(); { CONSTRUCT_BOX(moov_video_trak_mdia_); { @@ -590,6 +605,53 @@ class Mp4File : public VirtualFile { } } + void MaybeAppendVideoEdts() { + struct Entry { + Entry(int32_t segment_duration, int32_t media_time) + : segment_duration(segment_duration), media_time(media_time) {} + int32_t segment_duration = 0; + int32_t media_time = 0; + int32_t end() const { return segment_duration + media_time; } + }; + std::vector entries; + int64_t cur_media_time = 0; + for (const auto &segment : segments_) { + auto skip = segment->rel_start_90k - segment->pieces.start_90k(); + auto keep = segment->pieces.end_90k() - segment->rel_start_90k; + DCHECK_GE(skip, 0); + DCHECK_GT(keep, 0); + cur_media_time += skip; + if (!entries.empty() && entries.back().end() == cur_media_time) { + entries.back().segment_duration += keep; + } else { + entries.emplace_back(keep, cur_media_time); + } + DCHECK_GT(segment->pieces.duration_90k(), 0); + cur_media_time += keep; + } + if (entries.size() == 1 && entries[0].media_time == 0) { + return; // use implicit one-to-one mapping. + } + + VLOG(1) << "Using edit list with " << entries.size() << " entries."; + std::string *s = &moov_video_trak_edts_elst_entries_str_; + for (const auto &entry : entries) { + VLOG(2) << "...duration=" << entry.segment_duration + << ", time=" << entry.media_time; + AppendU32(entry.segment_duration, s); + AppendU32(entry.media_time, s); + AppendU16(1, s); // media_rate_integer + AppendU16(1, s); // media_rate_fraction + } + CONSTRUCT_BOX(moov_video_trak_edts_); + CONSTRUCT_BOX(moov_video_trak_edts_elst_); + moov_video_trak_edts_elst_.header().entry_count = + ToNetworkU32(entries.size()); + moov_video_trak_edts_elst_entries_.Init( + moov_video_trak_edts_elst_entries_str_); + slices_.Append(&moov_video_trak_edts_elst_entries_); + } + void AppendVideoStbl() { CONSTRUCT_BOX(moov_video_trak_mdia_minf_stbl_); { @@ -763,7 +825,7 @@ class Mp4File : public VirtualFile { std::string &s = moov_subtitle_trak_mdia_minf_stbl_stts_entries_str_; for (const auto &segment : segments_) { int64_t start_90k = - segment->recording.start_time_90k + segment->pieces.start_90k(); + segment->recording.start_time_90k + segment->rel_start_90k; int64_t end_90k = segment->recording.start_time_90k + segment->pieces.end_90k(); int64_t start_next_90k = @@ -802,7 +864,7 @@ class Mp4File : public VirtualFile { memset(&mytm, 0, sizeof(mytm)); for (const auto &segment : segments_) { int64_t start_90k = - segment->recording.start_time_90k + segment->pieces.start_90k(); + segment->recording.start_time_90k + segment->rel_start_90k; int64_t end_90k = segment->recording.start_time_90k + segment->pieces.end_90k(); int64_t start_ts = start_90k / kTimeUnitsPerSecond; @@ -839,6 +901,10 @@ class Mp4File : public VirtualFile { Mp4Box moov_video_trak_; Mp4Box moov_video_trak_tkhd_; + Mp4Box moov_video_trak_edts_; + Mp4Box moov_video_trak_edts_elst_; + StringPieceSlice moov_video_trak_edts_elst_entries_; + std::string moov_video_trak_edts_elst_entries_str_; Mp4Box moov_video_trak_mdia_; Mp4Box moov_video_trak_mdia_mdhd_; StringPieceSlice moov_video_trak_mdia_hdlr_; @@ -906,6 +972,7 @@ bool Mp4SampleTablePieces::Init(const Recording *recording, key_frames_ = recording->video_sync_samples; actual_end_90k_ = recording_duration_90k; } else { + VLOG(1) << "Slow path."; if (!it.done() && !it.is_key()) { *error_message = "First frame must be a key frame."; return false; @@ -941,6 +1008,7 @@ bool Mp4SampleTablePieces::Init(const Recording *recording, *error_message = it.error(); return false; } + actual_end_90k_ = std::min(actual_end_90k_, desired_end_90k_); VLOG(1) << "requested ts [" << start_90k << ", " << end_90k << "), got ts [" << begin_.start_90k() << ", " << actual_end_90k_ << "), " << frames_ << " frames (" << key_frames_ @@ -967,7 +1035,17 @@ bool Mp4SampleTablePieces::FillSttsEntries(std::string *s, for (it = begin_; !it.done() && it.start_90k() < desired_end_90k_; it.Next()) { AppendU32(1, s); - AppendU32(it.duration_90k(), s); + + // The final sample may be shortened to the desired end. + if (it.end_90k() > desired_end_90k_) { + auto new_duration = desired_end_90k_ - it.start_90k(); + VLOG(1) << "Shortening final sample duration from " << it.duration_90k() + << " to " << new_duration; + AppendU32(new_duration, s); + break; + } else { + AppendU32(it.duration_90k(), s); + } } if (it.has_error()) { *error_message = it.error(); diff --git a/src/mp4.h b/src/mp4.h index b0b8b94..56d557b 100644 --- a/src/mp4.h +++ b/src/mp4.h @@ -66,10 +66,10 @@ class Mp4SampleTablePieces { // samples() values. // // |start_90k| and |end_90k| should be relative to the start of the recording. - // They indicate the *desired* time range. The *actual* time range will be - // from the last sync sample <= |start_90k| to the last sample with start time - // <= |end_90k|. TODO: support edit lists and duration trimming to produce - // the exact correct time range. + // They indicate the *desired* time range. The *actual* time range will + // start at the last sync sample <= |start_90k|. (The caller is responsible + // for creating an edit list to skip the undesired portion.) It will end at + // the desired range, or the end of the recording, whichever is sooner. bool Init(const Recording *recording, int sample_entry_index, int32_t sample_offset, int32_t start_90k, int32_t end_90k, std::string *error_message); @@ -88,8 +88,8 @@ class Mp4SampleTablePieces { // Return the byte range in the sample file of the frames represented here. ByteRange sample_pos() const { return sample_pos_; } + // As described above, these may differ from the desired range. uint64_t duration_90k() const { return actual_end_90k_ - begin_.start_90k(); } - int32_t start_90k() const { return begin_.start_90k(); } int32_t end_90k() const { return actual_end_90k_; } @@ -121,7 +121,15 @@ struct Mp4FileSegment { Recording recording; Mp4SampleTablePieces pieces; RealFileSlice sample_file_slice; + + // Requested start time, relative to recording.start_90k. + // If there is no key frame at exactly this position, |pieces| will actually + // start sooner, and an edit list should be used to skip the undesired + // prefix. int32_t rel_start_90k = 0; + + // Requested end time, relative to recording.end_90k. + // This will be clamped to the actual duration of the recording. int32_t rel_end_90k = std::numeric_limits::max(); }; diff --git a/src/web.cc b/src/web.cc index 0d25912..7eb1091 100644 --- a/src/web.cc +++ b/src/web.cc @@ -375,14 +375,15 @@ std::shared_ptr WebInterface::BuildMp4( bool ok = true; auto row_cb = [&](Recording &recording, const VideoSampleEntry &sample_entry) { - if (rows == 0 && recording.start_time_90k != next_row_start_time_90k) { + if (rows == 0 && recording.start_time_90k > next_row_start_time_90k) { *error_message = StrCat( "recording starts late: ", PrettyTimestamp(recording.start_time_90k), " (", recording.start_time_90k, ") rather than requested: ", PrettyTimestamp(start_time_90k), " (", start_time_90k, ")"); ok = false; return IterationControl::kBreak; - } else if (recording.start_time_90k != next_row_start_time_90k) { + } else if (rows > 0 && + recording.start_time_90k != next_row_start_time_90k) { *error_message = StrCat("gap/overlap in recording: ", PrettyTimestamp(next_row_start_time_90k), " (", next_row_start_time_90k, ") to: ", @@ -405,10 +406,15 @@ std::shared_ptr WebInterface::BuildMp4( builder.SetSampleEntry(sample_entry); } - // TODO: correct bounds within recording. - // Currently this can return too much data. - builder.Append(std::move(recording), 0, - std::numeric_limits::max()); + int32_t rel_start_90k = 0; + int32_t rel_end_90k = std::numeric_limits::max(); + if (recording.start_time_90k < start_time_90k) { + rel_start_90k = start_time_90k - recording.start_time_90k; + } + if (recording.end_time_90k > end_time_90k) { + rel_end_90k = end_time_90k - recording.start_time_90k; + } + builder.Append(std::move(recording), rel_start_90k, rel_end_90k); ++rows; return IterationControl::kContinue; }; @@ -421,7 +427,7 @@ std::shared_ptr WebInterface::BuildMp4( *error_message = StrCat("no recordings in range"); return false; } - if (next_row_start_time_90k != end_time_90k) { + if (next_row_start_time_90k < end_time_90k) { *error_message = StrCat("recording ends early: ", PrettyTimestamp(next_row_start_time_90k), " (", next_row_start_time_90k, "), not requested: ",