// This file is part of Moonfire NVR, a security camera network video recorder. // Copyright (C) 2021 The Moonfire NVR Authors; see AUTHORS and LICENSE.txt. // SPDX-License-Identifier: GPL-v3.0-or-later WITH GPL-3.0-linking-exception. //! H.264 decoding //! //! For the most part, Moonfire NVR does not try to understand the video codec. However, H.264 has //! two byte stream encodings: ISO/IEC 14496-10 Annex B, and ISO/IEC 14496-15 AVC access units. //! When streaming from RTSP, ffmpeg supplies the former. We need the latter to stick into `.mp4` //! files. This file manages the conversion, both for the ffmpeg "extra data" (which should become //! the ISO/IEC 14496-15 section 5.2.4.1 `AVCDecoderConfigurationRecord`) and the actual samples. //! //! See the [wiki page on standards and //! specifications](https://github.com/scottlamb/moonfire-nvr/wiki/Standards-and-specifications) //! for help finding a copy of the relevant standards. This code won't make much sense without them! //! //! ffmpeg of course has logic to do the same thing, but unfortunately it is not exposed except //! through ffmpeg's own generated `.mp4` file. Extracting just this part of their `.mp4` files //! would be more trouble than it's worth. use byteorder::{BigEndian, ByteOrder, WriteBytesExt}; use failure::{bail, format_err, Error}; use std::convert::TryFrom; // See ISO/IEC 14496-10 table 7-1 - NAL unit type codes, syntax element categories, and NAL unit // type classes. const NAL_UNIT_SEQ_PARAMETER_SET: u8 = 7; const NAL_UNIT_PIC_PARAMETER_SET: u8 = 8; const NAL_UNIT_TYPE_MASK: u8 = 0x1F; // bottom 5 bits of first byte of unit. // For certain common sub stream anamorphic resolutions, add a pixel aspect ratio box. // Assume the camera is 16x9. These are just the standard wide mode; default_pixel_aspect_ratio // tries the transpose also. const PIXEL_ASPECT_RATIOS: [((u16, u16), (u16, u16)); 6] = [ ((320, 240), (4, 3)), ((352, 240), (40, 33)), ((640, 352), (44, 45)), ((640, 480), (4, 3)), ((704, 480), (40, 33)), ((720, 480), (32, 27)), ]; /// Get the pixel aspect ratio to use if none is specified. /// /// The Dahua IPC-HDW5231R-Z sets the aspect ratio in the H.264 SPS (correctly) for both square and /// non-square pixels. The Hikvision DS-2CD2032-I doesn't set it, even though the sub stream's /// pixels aren't square. So define a default based on the pixel dimensions to use if the camera /// doesn't tell us what to do. /// /// Note that at least in the case of .mp4 muxing, we don't need to fix up the underlying SPS. /// SPS; PixelAspectRatioBox's definition says that it overrides the H.264-level declaration. fn default_pixel_aspect_ratio(width: u16, height: u16) -> (u16, u16) { if width >= height { PIXEL_ASPECT_RATIOS .iter() .find(|r| r.0 == (width, height)) .map(|r| r.1) .unwrap_or((1, 1)) } else { PIXEL_ASPECT_RATIOS .iter() .find(|r| r.0 == (height, width)) .map(|r| (r.1 .1, r.1 .0)) .unwrap_or((1, 1)) } } /// Decodes a H.264 Annex B byte stream into NAL units. Calls `f` for each NAL unit in the byte /// stream. Aborts if `f` returns error. /// /// Note `f` is called with the encoded NAL form, not the RBSP. The NAL header byte and any /// emulation prevention bytes will be present. /// /// See ISO/IEC 14496-10 section B.2: Byte stream NAL unit decoding process. /// This is a relatively simple, unoptimized implementation. /// /// TODO: detect invalid byte streams. For example, several 0x00s not followed by a 0x01, a stream /// stream not starting with 0x00 0x00 0x00 0x01, or an empty NAL unit. fn decode_h264_annex_b<'a, F>(mut data: &'a [u8], mut f: F) -> Result<(), Error> where F: FnMut(&'a [u8]) -> Result<(), Error>, { let start_code = &b"\x00\x00\x01"[..]; use nom::FindSubstring; 'outer: while let Some(pos) = data.find_substring(start_code) { let mut unit = &data[0..pos]; data = &data[pos + start_code.len()..]; // Have zero or more bytes that end in a start code. Strip out any trailing 0x00s and // process the unit if there's anything left. loop { match unit.last() { None => continue 'outer, Some(b) if *b == 0 => { unit = &unit[..unit.len() - 1]; } Some(_) => break, } } f(unit)?; } // No remaining start codes; likely a unit left. if !data.is_empty() { f(data)?; } Ok(()) } /// Parses Annex B extra data, returning a tuple holding the `sps` and `pps` substrings. fn parse_annex_b_extra_data(data: &[u8]) -> Result<(&[u8], &[u8]), Error> { let mut sps = None; let mut pps = None; decode_h264_annex_b(data, |unit| { let nal_type = (unit[0] as u8) & NAL_UNIT_TYPE_MASK; match nal_type { NAL_UNIT_SEQ_PARAMETER_SET => sps = Some(unit), NAL_UNIT_PIC_PARAMETER_SET => pps = Some(unit), _ => bail!("Expected SPS and PPS; got type {}", nal_type), }; Ok(()) })?; match (sps, pps) { (Some(s), Some(p)) => Ok((s, p)), _ => bail!("SPS and PPS must be specified"), } } /// Parsed representation of ffmpeg's "extradata". #[derive(Debug, PartialEq, Eq)] pub struct ExtraData { pub entry: db::VideoSampleEntryToInsert, /// True iff sample data should be transformed from Annex B format to AVC format via a call to /// `transform_sample_data`. (The assumption is that if the extra data was in Annex B format, /// the sample data is also.) pub need_transform: bool, } impl ExtraData { /// Parses "extradata" from ffmpeg. This data may be in either Annex B format or AVC format. pub fn parse(extradata: &[u8], width: u16, height: u16) -> Result { let raw_sps_and_pps; let need_transform; let ctx; let sps_owner; let sps; // reference to either within ctx or to sps_owner. if extradata.starts_with(b"\x00\x00\x00\x01") || extradata.starts_with(b"\x00\x00\x01") { // ffmpeg supplied "extradata" in Annex B format. let (s, p) = parse_annex_b_extra_data(extradata)?; let rbsp = h264_reader::rbsp::decode_nal(&s[1..]); sps_owner = h264_reader::nal::sps::SeqParameterSet::from_bytes(&rbsp) .map_err(|e| format_err!("Bad SPS: {:?}", e))?; sps = &sps_owner; raw_sps_and_pps = Some((s, p)); need_transform = true; } else { // Assume "extradata" holds an AVCDecoderConfiguration. need_transform = false; raw_sps_and_pps = None; let avcc = h264_reader::avcc::AvcDecoderConfigurationRecord::try_from(extradata) .map_err(|e| format_err!("Bad AvcDecoderConfigurationRecord: {:?}", e))?; if avcc.num_of_sequence_parameter_sets() != 1 { bail!("Multiple SPSs!"); } ctx = avcc .create_context(()) .map_err(|e| format_err!("Can't load SPS+PPS: {:?}", e))?; sps = ctx .sps_by_id(h264_reader::nal::pps::ParamSetId::from_u32(0).unwrap()) .ok_or_else(|| format_err!("No SPS 0"))?; }; let mut sample_entry = Vec::with_capacity(256); // This is a concatenation of the following boxes/classes. // SampleEntry, ISO/IEC 14496-12 section 8.5.2. let avc1_len_pos = sample_entry.len(); // length placeholder + type + reserved + data_reference_index = 1 sample_entry.extend_from_slice(b"\x00\x00\x00\x00avc1\x00\x00\x00\x00\x00\x00\x00\x01"); // VisualSampleEntry, ISO/IEC 14496-12 section 12.1.3. sample_entry.extend_from_slice(&[0; 16]); // pre-defined + reserved sample_entry.write_u16::(width)?; sample_entry.write_u16::(height)?; sample_entry.extend_from_slice(&[ 0x00, 0x48, 0x00, 0x00, // horizresolution 0x00, 0x48, 0x00, 0x00, // vertresolution 0x00, 0x00, 0x00, 0x00, // reserved 0x00, 0x01, // frame count 0x00, 0x00, 0x00, 0x00, // compressorname 0x00, 0x00, 0x00, 0x00, // 0x00, 0x00, 0x00, 0x00, // 0x00, 0x00, 0x00, 0x00, // 0x00, 0x00, 0x00, 0x00, // 0x00, 0x00, 0x00, 0x00, // 0x00, 0x00, 0x00, 0x00, // 0x00, 0x00, 0x00, 0x00, // 0x00, 0x18, 0xff, 0xff, // depth + pre_defined ]); // AVCSampleEntry, ISO/IEC 14496-15 section 5.3.4.1. // AVCConfigurationBox, ISO/IEC 14496-15 section 5.3.4.1. let avcc_len_pos = sample_entry.len(); sample_entry.extend_from_slice(b"\x00\x00\x00\x00avcC"); if let Some((sps, pps)) = raw_sps_and_pps { // Create the AVCDecoderConfiguration, ISO/IEC 14496-15 section 5.2.4.1. // The beginning of the AVCDecoderConfiguration takes a few values from // the SPS (ISO/IEC 14496-10 section 7.3.2.1.1). One caveat: that section // defines the syntax in terms of RBSP, not NAL. The difference is the // escaping of 00 00 01 and 00 00 02; see notes about // "emulation_prevention_three_byte" in ISO/IEC 14496-10 section 7.4. // It looks like 00 is not a valid value of profile_idc, so this distinction // shouldn't be relevant here. And ffmpeg seems to ignore it. sample_entry.push(1); // configurationVersion sample_entry.push(sps[1]); // profile_idc . AVCProfileIndication sample_entry.push(sps[2]); // ...misc bits... . profile_compatibility sample_entry.push(sps[3]); // level_idc . AVCLevelIndication // Hardcode lengthSizeMinusOne to 3, matching TransformSampleData's 4-byte // lengths. sample_entry.push(0xff); // Only support one SPS and PPS. // ffmpeg's ff_isom_write_avcc has the same limitation, so it's probably // fine. This next byte is a reserved 0b111 + a 5-bit # of SPSs (1). sample_entry.push(0xe1); sample_entry.write_u16::(u16::try_from(sps.len())?)?; sample_entry.extend_from_slice(sps); sample_entry.push(1); // # of PPSs. sample_entry.write_u16::(u16::try_from(pps.len())?)?; sample_entry.extend_from_slice(pps); } else { sample_entry.extend_from_slice(extradata); }; // Fix up avc1 and avcC box lengths. let cur_pos = sample_entry.len(); BigEndian::write_u32( &mut sample_entry[avcc_len_pos..avcc_len_pos + 4], u32::try_from(cur_pos - avcc_len_pos)?, ); // PixelAspectRatioBox, ISO/IEC 14496-12 section 12.1.4.2. // Write a PixelAspectRatioBox if necessary, as the sub streams can be be anamorphic. let pasp = sps .vui_parameters .as_ref() .and_then(|v| v.aspect_ratio_info.as_ref()) .and_then(|a| a.clone().get()) .unwrap_or_else(|| default_pixel_aspect_ratio(width, height)); if pasp != (1, 1) { sample_entry.extend_from_slice(b"\x00\x00\x00\x10pasp"); // length + box name sample_entry.write_u32::(pasp.0.into())?; sample_entry.write_u32::(pasp.1.into())?; } let cur_pos = sample_entry.len(); BigEndian::write_u32( &mut sample_entry[avc1_len_pos..avc1_len_pos + 4], u32::try_from(cur_pos - avc1_len_pos)?, ); let profile_idc = sample_entry[103]; let constraint_flags = sample_entry[104]; let level_idc = sample_entry[105]; let rfc6381_codec = format!( "avc1.{:02x}{:02x}{:02x}", profile_idc, constraint_flags, level_idc ); Ok(ExtraData { entry: db::VideoSampleEntryToInsert { data: sample_entry, rfc6381_codec, width, height, pasp_h_spacing: pasp.0, pasp_v_spacing: pasp.1, }, need_transform, }) } } /// Transforms sample data from Annex B format to AVC format. Should be called on samples iff /// `ExtraData::need_transform` is true. Uses an out parameter `avc_sample` rather than a return /// so that memory allocations can be reused from sample to sample. pub fn transform_sample_data(annexb_sample: &[u8], avc_sample: &mut Vec) -> Result<(), Error> { // See AVCParameterSamples, ISO/IEC 14496-15 section 5.3.2. avc_sample.clear(); // The output will be about as long as the input. Annex B stop codes require at least three // bytes; many seem to be four. The output lengths are exactly four. avc_sample.reserve(annexb_sample.len() + 4); decode_h264_annex_b(annexb_sample, |unit| { // 4-byte length; this must match ParseExtraData's lengthSizeMinusOne == 3. avc_sample.write_u32::(unit.len() as u32)?; // length avc_sample.extend_from_slice(unit); Ok(()) })?; Ok(()) } #[cfg(test)] mod tests { use db::testutil; #[rustfmt::skip] const ANNEX_B_TEST_INPUT: [u8; 35] = [ 0x00, 0x00, 0x00, 0x01, 0x67, 0x4d, 0x00, 0x1f, 0x9a, 0x66, 0x02, 0x80, 0x2d, 0xff, 0x35, 0x01, 0x01, 0x01, 0x40, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x1d, 0x4c, 0x01, 0x00, 0x00, 0x00, 0x01, 0x68, 0xee, 0x3c, 0x80, ]; #[rustfmt::skip] const AVC_DECODER_CONFIG_TEST_INPUT: [u8; 38] = [ 0x01, 0x4d, 0x00, 0x1f, 0xff, 0xe1, 0x00, 0x17, 0x67, 0x4d, 0x00, 0x1f, 0x9a, 0x66, 0x02, 0x80, 0x2d, 0xff, 0x35, 0x01, 0x01, 0x01, 0x40, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x1d, 0x4c, 0x01, 0x01, 0x00, 0x04, 0x68, 0xee, 0x3c, 0x80, ]; #[rustfmt::skip] const TEST_OUTPUT: [u8; 132] = [ 0x00, 0x00, 0x00, 0x84, 0x61, 0x76, 0x63, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x02, 0xd0, 0x00, 0x48, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0xff, 0xff, 0x00, 0x00, 0x00, 0x2e, 0x61, 0x76, 0x63, 0x43, 0x01, 0x4d, 0x00, 0x1f, 0xff, 0xe1, 0x00, 0x17, 0x67, 0x4d, 0x00, 0x1f, 0x9a, 0x66, 0x02, 0x80, 0x2d, 0xff, 0x35, 0x01, 0x01, 0x01, 0x40, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x1d, 0x4c, 0x01, 0x01, 0x00, 0x04, 0x68, 0xee, 0x3c, 0x80, ]; #[test] fn test_decode() { testutil::init(); let data = &ANNEX_B_TEST_INPUT; let mut pieces = Vec::new(); super::decode_h264_annex_b(data, |p| { pieces.push(p); Ok(()) }) .unwrap(); assert_eq!(&pieces, &[&data[4..27], &data[31..]]); } #[test] fn test_sample_entry_from_avc_decoder_config() { testutil::init(); let e = super::ExtraData::parse(&AVC_DECODER_CONFIG_TEST_INPUT, 1280, 720).unwrap(); assert_eq!(&e.entry.data[..], &TEST_OUTPUT[..]); assert_eq!(e.entry.width, 1280); assert_eq!(e.entry.height, 720); assert_eq!(e.entry.rfc6381_codec, "avc1.4d001f"); assert_eq!(e.need_transform, false); } #[test] fn test_sample_entry_from_annex_b() { testutil::init(); let e = super::ExtraData::parse(&ANNEX_B_TEST_INPUT, 1280, 720).unwrap(); assert_eq!(e.entry.width, 1280); assert_eq!(e.entry.height, 720); assert_eq!(e.entry.rfc6381_codec, "avc1.4d001f"); assert_eq!(e.need_transform, true); } #[test] fn test_transform_sample_data() { testutil::init(); #[rustfmt::skip] const INPUT: [u8; 64] = [ 0x00, 0x00, 0x00, 0x01, 0x67, 0x4d, 0x00, 0x1f, 0x9a, 0x66, 0x02, 0x80, 0x2d, 0xff, 0x35, 0x01, 0x01, 0x01, 0x40, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x1d, 0x4c, 0x01, 0x00, 0x00, 0x00, 0x01, 0x68, 0xee, 0x3c, 0x80, 0x00, 0x00, 0x00, 0x01, 0x06, 0x06, 0x01, 0xc4, 0x80, 0x00, 0x00, 0x00, 0x01, 0x65, 0x88, 0x80, 0x10, 0x00, 0x08, 0x7f, 0x00, 0x5d, 0x27, 0xb5, 0xc1, 0xff, 0x8c, 0xd6, 0x35, // (truncated) ]; #[rustfmt::skip] const EXPECTED_OUTPUT: [u8; 64] = [ 0x00, 0x00, 0x00, 0x17, 0x67, 0x4d, 0x00, 0x1f, 0x9a, 0x66, 0x02, 0x80, 0x2d, 0xff, 0x35, 0x01, 0x01, 0x01, 0x40, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x1d, 0x4c, 0x01, 0x00, 0x00, 0x00, 0x04, 0x68, 0xee, 0x3c, 0x80, 0x00, 0x00, 0x00, 0x05, 0x06, 0x06, 0x01, 0xc4, 0x80, 0x00, 0x00, 0x00, 0x10, 0x65, 0x88, 0x80, 0x10, 0x00, 0x08, 0x7f, 0x00, 0x5d, 0x27, 0xb5, 0xc1, 0xff, 0x8c, 0xd6, 0x35, ]; let mut out = Vec::new(); super::transform_sample_data(&INPUT, &mut out).unwrap(); assert_eq!(&out[..], &EXPECTED_OUTPUT[..]); } #[test] fn pixel_aspect_ratios() { use super::default_pixel_aspect_ratio; use num_rational::Ratio; for &((w, h), _) in &super::PIXEL_ASPECT_RATIOS { let (h_spacing, v_spacing) = default_pixel_aspect_ratio(w, h); assert_eq!(Ratio::new(w * h_spacing, h * v_spacing), Ratio::new(16, 9)); // 90 or 270 degree rotation. let (h_spacing, v_spacing) = default_pixel_aspect_ratio(h, w); assert_eq!(Ratio::new(h * h_spacing, w * v_spacing), Ratio::new(9, 16)); } } }