From 01c6a2fb26c5d946f3b1a4e12beca59f65fb18f3 Mon Sep 17 00:00:00 2001 From: James Gregory Date: Wed, 7 Feb 2018 11:32:47 -0800 Subject: [PATCH 1/5] Query underlying Stream object for Length, but fallback to original mechanism if the stream doesn't support the Length operation. Addresses https://github.com/drewnoakes/metadata-extractor-dotnet/issues/122 by allowing for streamed parsing of metadata. --- MetadataExtractor/IO/IndexedCapturingReader.cs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/MetadataExtractor/IO/IndexedCapturingReader.cs b/MetadataExtractor/IO/IndexedCapturingReader.cs index b319d5ede..be58c5af5 100644 --- a/MetadataExtractor/IO/IndexedCapturingReader.cs +++ b/MetadataExtractor/IO/IndexedCapturingReader.cs @@ -60,9 +60,16 @@ public override long Length { get { - IsValidIndex(int.MaxValue, 1); - Debug.Assert(_isStreamFinished); - return _streamLength; + try + { + return _stream.Length; + } + catch (NotSupportedException) + { + IsValidIndex(int.MaxValue, 1); + Debug.Assert(_isStreamFinished); + return _streamLength; + } } } From b6b61ff3d2b8c13ff0d0e6bac30933c53628167e Mon Sep 17 00:00:00 2001 From: James Gregory Date: Wed, 7 Feb 2018 14:50:09 -0800 Subject: [PATCH 2/5] Cache status of Length being supported on the file stream, per Drew's comment. --- .../IO/IndexedCapturingReader.cs | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/MetadataExtractor/IO/IndexedCapturingReader.cs b/MetadataExtractor/IO/IndexedCapturingReader.cs index be58c5af5..e2b940b35 100644 --- a/MetadataExtractor/IO/IndexedCapturingReader.cs +++ b/MetadataExtractor/IO/IndexedCapturingReader.cs @@ -41,6 +41,7 @@ public sealed class IndexedCapturingReader : IndexedReader private readonly List _chunks = new List(); private bool _isStreamFinished; private int _streamLength; + private bool _streamLengthThrewException = false; public IndexedCapturingReader([NotNull] Stream stream, int chunkLength = DefaultChunkLength, bool isMotorolaByteOrder = true) : base(isMotorolaByteOrder) @@ -60,16 +61,21 @@ public override long Length { get { - try + if (!_streamLengthThrewException) { - return _stream.Length; - } - catch (NotSupportedException) - { - IsValidIndex(int.MaxValue, 1); - Debug.Assert(_isStreamFinished); - return _streamLength; + try + { + return _stream.Length; + } + catch (NotSupportedException) + { + _streamLengthThrewException = true; + } } + + IsValidIndex(int.MaxValue, 1); + Debug.Assert(_isStreamFinished); + return _streamLength; } } From 6f7e2f0dc7c265efe60df3b4846e3b476f970b00 Mon Sep 17 00:00:00 2001 From: James Gregory Date: Thu, 8 Feb 2018 15:24:51 -0800 Subject: [PATCH 3/5] Initial version of IndexedCapturingReader that won't read the entire file unless it's needed. 
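Before the larger rework in patch 3, the pattern that patches 1 and 2 establish is worth seeing in isolation: ask the underlying Stream for its Length once, and remember when that throws so the read-to-end fallback is used from then on. The sketch below is illustrative only; the LengthProbe type and its method are not part of the patch series.

    using System;
    using System.IO;

    // Illustrative sketch of the probe-and-cache pattern from patches 1 and 2.
    // None of these names appear in the patches themselves.
    sealed class LengthProbe
    {
        private bool _lengthUnsupported;

        // Returns the stream's length when the stream can report it, otherwise null.
        // The cost of the NotSupportedException is only ever paid once.
        public long? TryGetLength(Stream stream)
        {
            if (_lengthUnsupported)
                return null;

            try
            {
                return stream.Length;
            }
            catch (NotSupportedException)
            {
                // Corresponds to the _streamLengthThrewException flag added in patch 2.
                _lengthUnsupported = true;
                return null;
            }
        }
    }

Callers that get null back fall back to reading the stream to its end, exactly as the original Length getter did.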
--- .../IO/IndexedCapturingReader.cs | 222 ++++++++++++---- 1 file changed, 171 insertions(+), 51 deletions(-) diff --git a/MetadataExtractor/IO/IndexedCapturingReader.cs b/MetadataExtractor/IO/IndexedCapturingReader.cs index e2b940b35..386c6976c 100644 --- a/MetadataExtractor/IO/IndexedCapturingReader.cs +++ b/MetadataExtractor/IO/IndexedCapturingReader.cs @@ -33,15 +33,15 @@ namespace MetadataExtractor.IO /// Drew Noakes https://drewnoakes.com public sealed class IndexedCapturingReader : IndexedReader { - private const int DefaultChunkLength = 2 * 1024; + private const int DefaultChunkLength = 4 * 1024; [NotNull] private readonly Stream _stream; private readonly int _chunkLength; - private readonly List<byte[]> _chunks = new List<byte[]>(); - private bool _isStreamFinished; - private int _streamLength; - private bool _streamLengthThrewException = false; + private readonly Dictionary<int, byte[]> _chunks; + private int _maxChunkLoaded = -1; + private int _streamLength = -1; + private readonly bool _contiguousBufferMode; public IndexedCapturingReader([NotNull] Stream stream, int chunkLength = DefaultChunkLength, bool isMotorolaByteOrder = true) : base(isMotorolaByteOrder) { @@ -51,6 +51,40 @@ public IndexedCapturingReader([NotNull] Stream stream, int chunkLength = Default _chunkLength = chunkLength; _stream = stream ?? throw new ArgumentNullException(nameof(stream)); + + try + { + // For some reason, FileStreams are faster in contiguous mode. Since this is such a common case, we + // specifically check for it. + if (_stream is FileStream) + { + _contiguousBufferMode = true; + } + else + { + // If the stream is both seekable and has a length, switch to non-contiguous buffering mode. This + // will use Seek operations to access data that is far beyond the reach of what has been buffered, + // rather than reading the entire file into memory in this case. + _contiguousBufferMode = !(_stream.Length > 0 && _stream.CanSeek); + } + } + catch (NotSupportedException) + { + // Streams that don't support the Length property have to be handled in contiguous mode. + _contiguousBufferMode = true; + } + + if (!_contiguousBufferMode) + { + // If we know the length of the stream ahead of time, we can allocate a Dictionary with enough slots + // for all the chunks. We 2X it to try to avoid hash collisions. + var chunksCapacity = 2 * (_stream.Length / chunkLength); + _chunks = new Dictionary<int, byte[]>((int) chunksCapacity); + } + else + { + _chunks = new Dictionary<int, byte[]>(); + } } /// Reads to the end of the stream, in order to determine the total number of bytes.
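The heart of the constructor change above is the choice between the two buffering modes. Read in isolation, the decision is roughly the following; this is a standalone sketch rather than the patch's exact code, and the BufferingMode/UseContiguous names are invented for illustration.

    using System;
    using System.IO;

    static class BufferingMode
    {
        // true  -> buffer chunks sequentially as the stream is read (contiguous mode)
        // false -> seek to whichever chunk is requested (non-contiguous mode)
        public static bool UseContiguous(Stream stream)
        {
            // FileStream is special-cased: per the comment in the patch, contiguous mode
            // measured faster for files even though they could be read non-contiguously.
            if (stream is FileStream)
                return true;

            try
            {
                // A seekable stream with a known, positive Length can be read on demand.
                return !(stream.Length > 0 && stream.CanSeek);
            }
            catch (NotSupportedException)
            {
                // No Length available, so the data has to be buffered as it arrives.
                return true;
            }
        }
    }

Only in non-contiguous mode does the reader pre-size its chunk dictionary, using twice the expected chunk count to keep hash collisions down.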
@@ -61,21 +95,16 @@ public override long Length { get { - if (!_streamLengthThrewException) + if (_contiguousBufferMode) { - try + if (_streamLength != -1) { - return _stream.Length; - } - catch (NotSupportedException) - { - _streamLengthThrewException = true; + IsValidIndex(int.MaxValue, 1); } + return _streamLength; } - - IsValidIndex(int.MaxValue, 1); - Debug.Assert(_isStreamFinished); - return _streamLength; + + return _stream.Length; } } @@ -96,12 +125,126 @@ protected override void ValidateIndex(int index, int bytesRequested) if ((long)index + bytesRequested - 1 > int.MaxValue) throw new BufferBoundsException($"Number of requested bytes summed with starting index exceed maximum range of signed 32 bit integers (requested index: {index}, requested count: {bytesRequested})"); - Debug.Assert(_isStreamFinished); // TODO test that can continue using an instance of this type after this exception throw new BufferBoundsException(ToUnshiftedOffset(index), bytesRequested, _streamLength); } } + /// + /// Helper method for GetChunk. This will load the next chunk of data from the input stream. If non-contiguous + /// buffering mode is being used, this method relies on the caller (GetChunk) to set the stream's position + /// correctly. In contiguous buffer mode, this will simply be the next chunk in sequence (the stream's Position + /// field will just advance monotonically). + /// + /// + private byte[] LoadNextChunk() + { + var chunk = new byte[_chunkLength]; + var totalBytesRead = 0; + + while (totalBytesRead != _chunkLength) + { + var bytesRead = _stream.Read(chunk, totalBytesRead, _chunkLength - totalBytesRead); + totalBytesRead += bytesRead; + + // if no bytes were read at all, we've reached the end of the file. + if (bytesRead == 0 && totalBytesRead == 0) + { + return null; + } + + // If this read didn't produce any bytes, but a previous read did, we've hit the end of the file, so + // shrink the chunk down to the number of bytes we actually have. + if (bytesRead == 0) + { + var shrunkChunk = new byte[totalBytesRead]; + Buffer.BlockCopy(chunk, 0, shrunkChunk, 0, totalBytesRead); + return shrunkChunk; + } + } + + return chunk; + } + + // GetChunk is substantially slower for random accesses owing to needing to use a Dictionary, rather than a + // List. However, the typical access pattern isn't very random at all -- you generally read a whole series of + // bytes from the same chunk. So we just cache the last chunk that was read and return that directly if it's + // requested again. This is about 15% faster than going straight to the Dictionary. + private int _lastChunkIdx = -1; + private byte[] _lastChunkData = null; + + /// + /// Load the data for the given chunk (if necessary), and return it. Chunks are identified by their index, + /// which is their start offset divided by the chunk length. e.g. offset 10 will typically refer to chunk + /// index 0. See DoGetChunk() for implementation -- this function adds simple memoization.
+ /// + /// The index of the chunk to get + private byte[] GetChunk(int chunkIndex) + { + if (chunkIndex == _lastChunkIdx) + { + return _lastChunkData; + } + + var result = DoGetChunk(chunkIndex); + _lastChunkIdx = chunkIndex; + _lastChunkData = result; + + return result; + } + + private byte[] DoGetChunk(int chunkIndex) + { + byte[] result; + if (_chunks.TryGetValue(chunkIndex, out result)) + { + return result; + } + + if (!_contiguousBufferMode) + { + var chunkStart = chunkIndex * _chunkLength; + + // Often we will be reading long contiguous blocks, even in non-contiguous mode. Don't issue Seeks in + // that case, so as to avoid unnecessary syscalls. + if (chunkStart != _stream.Position) + { + _stream.Seek(chunkStart, SeekOrigin.Begin); + } + + var nextChunk = LoadNextChunk(); + if (nextChunk != null) + { + _chunks[chunkIndex] = nextChunk; + var newStreamLen = (chunkIndex * _chunkLength) + nextChunk.Length; + _streamLength = newStreamLen > _streamLength ? newStreamLen : _streamLength; + } + + return nextChunk; + } + + byte[] curChunk = null; + while (_maxChunkLoaded < chunkIndex) + { + var curChunkIdx = _maxChunkLoaded + 1; + curChunk = LoadNextChunk(); + if (curChunk != null) + { + _chunks[curChunkIdx] = curChunk; + var newStreamLen = (curChunkIdx * _chunkLength) + curChunk.Length; + _streamLength = newStreamLen > _streamLength ? newStreamLen : _streamLength; + } + else + { + return null; + } + + _maxChunkLoaded = curChunkIdx; + } + + return curChunk; + } + protected override bool IsValidIndex(int index, int bytesRequested) { if (index < 0 || bytesRequested < 0) @@ -111,45 +254,22 @@ protected override bool IsValidIndex(int index, int bytesRequested) if (endIndexLong > int.MaxValue) return false; + if (!_contiguousBufferMode) + { + return endIndexLong < _stream.Length; + } + var endIndex = (int)endIndexLong; - if (_isStreamFinished) - return endIndex < _streamLength; var chunkIndex = endIndex / _chunkLength; - // TODO test loading several chunks for a single request - while (chunkIndex >= _chunks.Count) + var endChunk = GetChunk(chunkIndex); + if (endChunk == null) { - Debug.Assert(!_isStreamFinished); - - var chunk = new byte[_chunkLength]; - var totalBytesRead = 0; - while (!_isStreamFinished && totalBytesRead != _chunkLength) - { - var bytesRead = _stream.Read(chunk, totalBytesRead, _chunkLength - totalBytesRead); - - if (bytesRead == 0) - { - // the stream has ended, which may be ok - _isStreamFinished = true; - _streamLength = _chunks.Count * _chunkLength + totalBytesRead; - // check we have enough bytes for the requested index - if (endIndex >= _streamLength) - { - _chunks.Add(chunk); - return false; - } - } - else - { - totalBytesRead += bytesRead; - } - } - - _chunks.Add(chunk); + return false; } - return true; + return endChunk.Length > (endIndex % _chunkLength); } public override int ToUnshiftedOffset(int localOffset) => localOffset; @@ -160,7 +280,7 @@ public override byte GetByte(int index) var chunkIndex = index / _chunkLength; var innerIndex = index % _chunkLength; - var chunk = _chunks[chunkIndex]; + var chunk = GetChunk(chunkIndex); return chunk[innerIndex]; } @@ -177,7 +297,7 @@ public override byte[] GetBytes(int index, int count) var fromChunkIndex = fromIndex / _chunkLength; var fromInnerIndex = fromIndex % _chunkLength; var length = Math.Min(remaining, _chunkLength - fromInnerIndex); - var chunk = _chunks[fromChunkIndex]; + var chunk = GetChunk(fromChunkIndex); Array.Copy(chunk, fromInnerIndex, bytes, toIndex, length); remaining -= length; fromIndex += length; From 
6160e1fbd212612d6718a332b8efcdab04e63867 Mon Sep 17 00:00:00 2001 From: James Gregory Date: Tue, 10 Apr 2018 13:11:37 -0700 Subject: [PATCH 4/5] Add test cases for the streamable IO described in #122. This repeats the IndexedCapturingReaderTests, for each of the permutations of seekable and length supported. --- .../IO/StreamingIndexedCapturingReaderTest.cs | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs diff --git a/MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs b/MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs new file mode 100644 index 000000000..1178405a2 --- /dev/null +++ b/MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs @@ -0,0 +1,107 @@ +#region License +// +// Copyright 2002-2017 Drew Noakes +// Ported from Java to C# by Yakov Danilov for Imazen LLC in 2014 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// More information about this project is available at: +// +// https://github.com/drewnoakes/metadata-extractor-dotnet +// https://drewnoakes.com/code/exif/ +// +#endregion + +using System; +using System.IO; +using MetadataExtractor.IO; +using Xunit; + +namespace MetadataExtractor.Tests.IO +{ + /// + /// Wrapper for MemoryStream that allows us to configure whether it is + /// seekable, and whether or not the Length property is supported. + /// + public class ConfigurableMemoryStream : MemoryStream + { + private bool _seekable; + private bool _lengthSupported; + + public ConfigurableMemoryStream(byte[] buffer, bool seekable, bool lengthSupported) : base(buffer) + { + _seekable = seekable; + _lengthSupported = lengthSupported; + } + + public override long Length + { + get + { + if (_lengthSupported) + { + return base.Length; + } + else + { + throw new NotSupportedException("Length property was disabled"); + } + } + } + + public override bool CanSeek => _seekable; + } + + /// Unit tests for . 
+ /// Drew Noakes https://drewnoakes.com + public abstract class StreamingIndexedCapturingReaderTestBase : IndexedReaderTestBase + { + [Fact] + public void ConstructWithNullBufferThrows() + { + // ReSharper disable once AssignNullToNotNullAttribute + Assert.Throws(() => new IndexedCapturingReader(null)); + } + } + + // Since the normal IndexedCapturingReaderTest uses MemoryStream, which both + // supports the Length property and is seekable, the following classes test + // the remaining permutations of options: + // + // * non-seekable, has length + // * seekable, doesn't have length + // * non-seekable, doesn't have length + public sealed class NonSeekableLengthSupportedIndexedCapturingReaderTest : StreamingIndexedCapturingReaderTestBase + { + protected override IndexedReader CreateReader(params byte[] bytes) + { + return new IndexedCapturingReader(new ConfigurableMemoryStream(bytes, false, true)); + } + } + + public sealed class SeekableLengthUnsupportedIndexedCapturingReaderTest : StreamingIndexedCapturingReaderTestBase + { + protected override IndexedReader CreateReader(params byte[] bytes) + { + return new IndexedCapturingReader(new ConfigurableMemoryStream(bytes, true, false)); + } + } + + public sealed class NonSeekableLengthUnsupportedIndexedCapturingReaderTest : StreamingIndexedCapturingReaderTestBase + { + protected override IndexedReader CreateReader(params byte[] bytes) + { + return new IndexedCapturingReader(new ConfigurableMemoryStream(bytes, false, false)); + } + } +} From 626355f62a894bed8b2dbfa1c228c28bb3ed0371 Mon Sep 17 00:00:00 2001 From: James Gregory Date: Tue, 10 Apr 2018 13:12:33 -0700 Subject: [PATCH 5/5] Fixes a couple of issues with length handling that were revealed by the test cases. --- MetadataExtractor/IO/IndexedCapturingReader.cs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/MetadataExtractor/IO/IndexedCapturingReader.cs b/MetadataExtractor/IO/IndexedCapturingReader.cs index a284210dd..a48c85b26 100644 --- a/MetadataExtractor/IO/IndexedCapturingReader.cs +++ b/MetadataExtractor/IO/IndexedCapturingReader.cs @@ -104,7 +104,7 @@ public override long Length { if (_contiguousBufferMode) { - if (_streamLength != -1) + if (_streamLength == -1) { IsValidIndex(int.MaxValue, 1); } @@ -133,7 +133,7 @@ protected override void ValidateIndex(int index, int bytesRequested) throw new BufferBoundsException($"Number of requested bytes summed with starting index exceed maximum range of signed 32 bit integers (requested index: {index}, requested count: {bytesRequested})"); // TODO test that can continue using an instance of this type after this exception - throw new BufferBoundsException(ToUnshiftedOffset(index), bytesRequested, _streamLength); + throw new BufferBoundsException(ToUnshiftedOffset(index), bytesRequested, Length); } } @@ -238,8 +238,16 @@ private byte[] DoGetChunk(int chunkIndex) if (curChunk != null) { _chunks[curChunkIdx] = curChunk; - var newStreamLen = (curChunkIdx * _chunkLength) + curChunk.Length; - _streamLength = newStreamLen > _streamLength ? newStreamLen : _streamLength; + if(_streamLength < 0) + { + // If this is the first chunk we've loaded, then initialize the stream length. + _streamLength = curChunk.Length; + } + else + { + var newStreamLen = _streamLength + curChunk.Length; + _streamLength = newStreamLen > _streamLength ? newStreamLen : _streamLength; + } } else {
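To make the permutations above concrete, here is an assumed usage sketch (the body of a hypothetical xUnit test, not code from the patch series) exercising the reader over a stream that neither seeks nor reports a Length, the case that forces contiguous buffering:

    // ConfigurableMemoryStream comes from patch 4; the assertions are xUnit's.
    var payload = new byte[] { 0, 1, 2, 3, 4, 5, 6, 7 };
    var stream = new ConfigurableMemoryStream(payload, seekable: false, lengthSupported: false);
    var reader = new IndexedCapturingReader(stream);

    // Only the chunks needed to satisfy the request are pulled from the stream.
    Assert.Equal((byte)3, reader.GetByte(3));

    // The stream cannot report a Length itself, so the reader's own bookkeeping answers
    // (all 8 bytes here).
    Assert.Equal(8L, reader.Length);

Patch 5 touches exactly this path: the Length getter's sentinel check is corrected to == -1, and in contiguous mode the length is now accumulated from the sizes of the chunks actually read, with the first loaded chunk initialising it.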