Skip to content

Commit 3dbb6b9

Browse files
committed
Matroska: scan for duration when not provided by metadata
Streamable files generally do not provide Duration via the header. In order to support figuring out the duration of these files, we can scan clusters from the start and the end of the file and give an estimate that way. Preserve parse_whole_file and hide_clusters functionality.
1 parent edafafd commit 3dbb6b9

File tree

1 file changed

+142
-10
lines changed

1 file changed

+142
-10
lines changed

getid3/module.audio-video.matroska.php

Lines changed: 142 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,11 @@
210210
define('EBML_ID_CLUSTERREFERENCEVIRTUAL', 0x7D); // [FD] -- Relative position of the data that should be in position of the virtual block.
211211

212212

213+
/**
214+
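* Default TimecodeScale, in nanoseconds per timecode unit (1000000 ns = 1 ms), used when the Info element does not provide one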
215+
*/
216+
define('MATROSKA_DEFAULT_TIMECODESCALE', 1000000);
217+
213218
/**
214219
* @tutorial http://www.matroska.org/technical/specs/index.html
215220
*
@@ -241,6 +246,7 @@ class getid3_matroska extends getid3_handler
241246
private $EBMLbuffer_length = 0;
242247
private $current_offset = 0;
243248
private $unuseful_elements = array(EBML_ID_CRC32, EBML_ID_VOID);
249+
private $parse_first_cluster = false;
244250

245251
/**
246252
* @return bool
@@ -256,14 +262,25 @@ public function Analyze()
256262
$this->error('EBML parser: '.$e->getMessage());
257263
}
258264

259-
// calculate playtime
260-
if (isset($info['matroska']['info']) && is_array($info['matroska']['info'])) {
261-
foreach ($info['matroska']['info'] as $key => $infoarray) {
262-
if (isset($infoarray['Duration'])) {
263-
// TimecodeScale is how many nanoseconds each Duration unit is
264-
$info['playtime_seconds'] = $infoarray['Duration'] * ((isset($infoarray['TimecodeScale']) ? $infoarray['TimecodeScale'] : 1000000) / 1000000000);
265-
break;
266-
}
265+
$this->calculatePlaytimeFromMetadata($info);
266+
267+
// If there was no duration metadata, this might be an incomplete file or a streaming file
268+
// We need Cluster information so we can use their timecodes to estimate playtime.
269+
if (!isset($info['playtime_seconds']) && !$this->parse_whole_file) {
270+
// If we have not yet scanned the entire file, scan the start and end for Clusters,
271+
$this->scanStartEndForClusters($info);
272+
}
273+
274+
if (isset($info['matroska']['cluster']) && is_array($info['matroska']['cluster'])) {
275+
if (!isset($info['playtime_seconds']) && !empty($info['matroska']['cluster'])) {
276+
// estimate playtime using clusters if not yet known
277+
$this->calculatePlaytimeFromClusters($info);
278+
}
279+
280+
// Remove cluster information from output if hide_clusters is true
281+
// These could have been set during scanStartEndForClusters()
282+
if ($this->hide_clusters) {
283+
unset($info['matroska']['cluster']);
267284
}
268285
}
269286

@@ -1246,8 +1263,13 @@ private function parseEBML(&$info) {
12461263
}
12471264
$this->current_offset = $subelement['end'];
12481265
}
1249-
if (!$this->hide_clusters) {
1250-
$info['matroska']['cluster'][] = $cluster_entry;
1266+
// Always store clusters internally (for duration calculation)
1267+
// They will be removed from output later if hide_clusters is true
1268+
$info['matroska']['cluster'][] = $cluster_entry;
1269+
1270+
// Stop parsing after finding first cluster
1271+
if ($this->parse_first_cluster) {
1272+
return;
12511273
}
12521274

12531275
// check to see if all the data we need exists already, if so, break out of the loop
@@ -1919,4 +1941,114 @@ private static function getDefaultStreamInfo($streams)
19191941
return $info;
19201942
}
19211943

1944+
/**
 * Derive playtime_seconds from the Duration stored in the Info metadata.
 *
 * The first Info entry that carries a Duration wins. Duration is expressed in
 * TimecodeScale units (nanoseconds per unit); when the entry has no
 * TimecodeScale, MATROSKA_DEFAULT_TIMECODESCALE is used instead.
 *
 * @param array $info getID3 info array; $info['playtime_seconds'] is written on success
 *
 * @return bool True if duration was set from metadata, false if no Info entry had a Duration
 */
private function calculatePlaytimeFromMetadata(&$info) {
	// Nothing to do when the Info element was never parsed
	if (!isset($info['matroska']['info']) || !is_array($info['matroska']['info'])) {
		return false;
	}

	foreach ($info['matroska']['info'] as $segmentInfo) {
		if (!isset($segmentInfo['Duration'])) {
			continue;
		}

		// TimecodeScale is how many nanoseconds each Duration unit is
		$nanosecondsPerUnit = MATROSKA_DEFAULT_TIMECODESCALE;
		if (isset($segmentInfo['TimecodeScale'])) {
			$nanosecondsPerUnit = $segmentInfo['TimecodeScale'];
		}

		$info['playtime_seconds'] = $segmentInfo['Duration'] * ($nanosecondsPerUnit / 1000000000);
		return true;
	}

	return false;
}
1961+
1962+
/**
 * Discard the buffered EBML data and move the parser to a new file offset.
 *
 * Empties the buffer string, zeroes its offset/length bookkeeping and records
 * $offset as the current offset.
 *
 * @param int $offset New starting offset for the buffer
 *
 * @return void
 */
private function resetParserBuffer($offset) {
	// Drop whatever was buffered from the previous position
	$this->EBMLbuffer        = '';
	$this->EBMLbuffer_length = 0;
	$this->EBMLbuffer_offset = 0;

	// Parsing continues from here
	$this->current_offset = $offset;
}
1973+
1974+
/**
 * Scan start and end of file for cluster information when Duration is missing.
 * Only use this if no Duration was found in the Info element and we are not in parse_whole_file mode.
 *
 * Pass 1 parses from $info['avdataoffset'] and stops after the first Cluster.
 * Pass 2 runs only if pass 1 stored at least one Cluster, and parses the last
 * (at most) 1MB before $info['avdataend']. Clusters found are appended to
 * $info['matroska']['cluster'] by parseEBML(). Parser exceptions are reported
 * through $this->error() and do not propagate.
 *
 * The parse_whole_file and parse_first_cluster flags are changed during the
 * scan and restored to the values they had on entry.
 *
 * @param array $info
 *
 * @return void
 */
private function scanStartEndForClusters(&$info) {
	// Remember the flags as they were on entry so they can be put back exactly, not forced to false
	$previous_parse_whole_file    = $this->parse_whole_file;
	$previous_parse_first_cluster = $this->parse_first_cluster;

	// we need to temporarily override parse_whole_file to be able to scan clusters
	$this->parse_whole_file = true;

	// Pass 1: start of file, stop as soon as the first cluster has been stored
	$this->parse_first_cluster = true;
	$this->resetParserBuffer($info['avdataoffset']);
	try {
		$this->parseEBML($info);
	} catch (Exception $e) {
		$this->error('EBML parser (start of file): '.$e->getMessage());
	}
	$this->parse_first_cluster = false;

	// Pass 2: scan end of file for last cluster.
	// !empty() also covers the case where pass 1 found no cluster at all and the key was never
	// created, which would otherwise raise an undefined index notice.
	if (!empty($info['matroska']['cluster']) && is_array($info['matroska']['cluster'])) {
		// maximum 1MB scan window before EOF, but never before the offset pass 1 started from
		// NOTE(review): the window start is not aligned to an element boundary; this relies on
		// parseEBML() coping with an arbitrary starting offset - confirm
		$scan_start = max($info['avdataoffset'], $info['avdataend'] - (1024 * 1024));
		$this->resetParserBuffer($scan_start);
		try {
			$this->parseEBML($info);
		} catch (Exception $e) {
			$this->error('EBML parser (end of file): '.$e->getMessage());
		}
	}

	$this->parse_whole_file    = $previous_parse_whole_file;
	$this->parse_first_cluster = $previous_parse_first_cluster;
}
2007+
2008+
/**
 * Look up the TimecodeScale to apply to timecodes.
 *
 * Returns the TimecodeScale of the first Info entry that defines one, or
 * MATROSKA_DEFAULT_TIMECODESCALE when none does (or no Info was parsed).
 *
 * @param array $info
 *
 * @return int TimecodeScale value
 */
private function getTimecodeScale(&$info) {
	if (isset($info['matroska']['info']) && is_array($info['matroska']['info'])) {
		foreach ($info['matroska']['info'] as $segmentInfo) {
			if (isset($segmentInfo['TimecodeScale'])) {
				// first match wins
				return $segmentInfo['TimecodeScale'];
			}
		}
	}

	return MATROSKA_DEFAULT_TIMECODESCALE;
}
2027+
2028+
/**
 * Estimate playtime_seconds from the timecodes of the stored clusters.
 *
 * Takes the smallest and largest ClusterTimecode found in
 * $info['matroska']['cluster'] and, when the largest is strictly greater,
 * sets $info['playtime_seconds'] to their difference scaled by
 * getTimecodeScale() (nanoseconds per unit). Otherwise $info is left untouched.
 *
 * @param array $info
 *
 * @return void
 */
private function calculatePlaytimeFromClusters(&$info) {
	if (!isset($info['matroska']['cluster']) || !is_array($info['matroska']['cluster'])) {
		return;
	}

	// Gather every cluster timecode that is present
	$timecodes = array();
	foreach ($info['matroska']['cluster'] as $cluster) {
		if (isset($cluster['ClusterTimecode'])) {
			$timecodes[] = $cluster['ClusterTimecode'];
		}
	}
	if (empty($timecodes)) {
		return;
	}

	$earliest = min($timecodes);
	$latest   = max($timecodes);

	// A single timecode (or identical ones) gives no usable span
	if ($latest > $earliest) {
		$info['playtime_seconds'] = ($latest - $earliest) * ($this->getTimecodeScale($info) / 1000000000);
	}
}
19222054
}

0 commit comments

Comments
 (0)