diff --git a/MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs b/MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs new file mode 100644 index 000000000..1178405a2 --- /dev/null +++ b/MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs @@ -0,0 +1,107 @@ +#region License +// +// Copyright 2002-2017 Drew Noakes +// Ported from Java to C# by Yakov Danilov for Imazen LLC in 2014 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// More information about this project is available at: +// +// https://github.com/drewnoakes/metadata-extractor-dotnet +// https://drewnoakes.com/code/exif/ +// +#endregion + +using System; +using System.IO; +using MetadataExtractor.IO; +using Xunit; + +namespace MetadataExtractor.Tests.IO +{ + /// + /// Wrapper for MemoryStream that allows us to configure whether it is + /// seekable, and whether or not the Length property is supported. + /// + public class ConfigurableMemoryStream : MemoryStream + { + private bool _seekable; + private bool _lengthSupported; + + public ConfigurableMemoryStream(byte[] buffer, bool seekable, bool lengthSupported) : base(buffer) + { + _seekable = seekable; + _lengthSupported = lengthSupported; + } + + public override long Length + { + get + { + if (_lengthSupported) + { + return base.Length; + } + else + { + throw new NotSupportedException("Length property was disabled"); + } + } + } + + public override bool CanSeek => _seekable; + } + + /// Unit tests for . 
+ /// Drew Noakes https://drewnoakes.com + public abstract class StreamingIndexedCapturingReaderTestBase : IndexedReaderTestBase + { + [Fact] + public void ConstructWithNullBufferThrows() + { + // ReSharper disable once AssignNullToNotNullAttribute + Assert.Throws(() => new IndexedCapturingReader(null)); + } + } + + // Since the normal IndexedCapturingReaderTest uses MemoryStream, which both + // supports the Length property and is seekable, the following classes test + // the remaining permutations of options: + // + // * non-seekable, has length + // * seekable, doesn't have length + // * non-seekable, doesn't have length + public sealed class NonSeekableLengthSupportedIndexedCapturingReaderTest : StreamingIndexedCapturingReaderTestBase + { + protected override IndexedReader CreateReader(params byte[] bytes) + { + return new IndexedCapturingReader(new ConfigurableMemoryStream(bytes, false, true)); + } + } + + public sealed class SeekableLengthUnsupportedIndexedCapturingReaderTest : StreamingIndexedCapturingReaderTestBase + { + protected override IndexedReader CreateReader(params byte[] bytes) + { + return new IndexedCapturingReader(new ConfigurableMemoryStream(bytes, true, false)); + } + } + + public sealed class NonSeekableLengthUnsupportedIndexedCapturingReaderTest : StreamingIndexedCapturingReaderTestBase + { + protected override IndexedReader CreateReader(params byte[] bytes) + { + return new IndexedCapturingReader(new ConfigurableMemoryStream(bytes, false, false)); + } + } +} diff --git a/MetadataExtractor/IO/IndexedCapturingReader.cs b/MetadataExtractor/IO/IndexedCapturingReader.cs index d517b5d1d..a48c85b26 100644 --- a/MetadataExtractor/IO/IndexedCapturingReader.cs +++ b/MetadataExtractor/IO/IndexedCapturingReader.cs @@ -33,15 +33,15 @@ namespace MetadataExtractor.IO /// Drew Noakes https://drewnoakes.com public sealed class IndexedCapturingReader : IndexedReader { - private const int DefaultChunkLength = 2 * 1024; + private const int DefaultChunkLength = 
4 * 1024; [NotNull] private readonly Stream _stream; private readonly int _chunkLength; - private readonly List _chunks = new List(); - private bool _isStreamFinished; - private int _streamLength; - private bool _streamLengthThrewException; + private readonly Dictionary _chunks; + private int _maxChunkLoaded = -1; + private int _streamLength = -1; + private readonly bool _contiguousBufferMode; public IndexedCapturingReader([NotNull] Stream stream, int chunkLength = DefaultChunkLength, bool isMotorolaByteOrder = true) : base(isMotorolaByteOrder) @@ -51,6 +51,40 @@ public IndexedCapturingReader([NotNull] Stream stream, int chunkLength = Default _chunkLength = chunkLength; _stream = stream ?? throw new ArgumentNullException(nameof(stream)); + + try + { + // For some reason, FileStreams are faster in contiguous mode. Since this is such a common case, we + // specifically check for it. + if (_stream is FileStream) + { + _contiguousBufferMode = true; + } + else + { + // If the stream is both seekable and has a length, switch to non-contiguous buffering mode. This + // will use Seek operations to access data that is far beyond the reach of what has been buffered, + // rather than reading the entire file into memory in this case. + _contiguousBufferMode = !(_stream.Length > 0 && _stream.CanSeek); + } + } + catch (NotSupportedException) + { + // Streams that don't support the Length property have to be handled in contiguous mode. + _contiguousBufferMode = true; + } + + if (!_contiguousBufferMode) + { + // If we know the length of the stream ahead of time, we can allocate a Dictionary with enough slots + // for all the chunks. We 2X it to try to avoid hash collisions. 
+ var chunksCapacity = 2 * (_stream.Length / chunkLength); + _chunks = new Dictionary((int) chunksCapacity); + } + else + { + _chunks = new Dictionary(); + } } /// @@ -68,21 +102,16 @@ public override long Length { get { - if (!_streamLengthThrewException) + if (_contiguousBufferMode) { - try + if (_streamLength == -1) { - return _stream.Length; - } - catch (NotSupportedException) - { - _streamLengthThrewException = true; + IsValidIndex(int.MaxValue, 1); } + return _streamLength; } - IsValidIndex(int.MaxValue, 1); - Debug.Assert(_isStreamFinished); - return _streamLength; + return _stream.Length; } } @@ -103,59 +132,159 @@ protected override void ValidateIndex(int index, int bytesRequested) if ((long)index + bytesRequested - 1 > int.MaxValue) throw new BufferBoundsException($"Number of requested bytes summed with starting index exceed maximum range of signed 32 bit integers (requested index: {index}, requested count: {bytesRequested})"); - Debug.Assert(_isStreamFinished); // TODO test that can continue using an instance of this type after this exception - throw new BufferBoundsException(ToUnshiftedOffset(index), bytesRequested, _streamLength); + throw new BufferBoundsException(ToUnshiftedOffset(index), bytesRequested, Length); } } - protected override bool IsValidIndex(int index, int bytesRequested) + /// + /// Helper method for GetChunk. This will load the next chunk of data from the input stream. If non contiguous + /// buffering mode is being used, this method relies on the caller (GetChunk) to set the stream's position + /// correctly. In contiguous buffer mode, this will simply be the next chunk in sequence (the stream's Position + /// field will just advance monotonically). 
+ /// + /// + private byte[] LoadNextChunk() { - if (index < 0 || bytesRequested < 0) - return false; + var chunk = new byte[_chunkLength]; + var totalBytesRead = 0; - var endIndexLong = (long)index + bytesRequested - 1; - if (endIndexLong > int.MaxValue) - return false; + while (totalBytesRead != _chunkLength) + { + var bytesRead = _stream.Read(chunk, totalBytesRead, _chunkLength - totalBytesRead); + totalBytesRead += bytesRead; + + // if no bytes were read at all, we've reached the end of the file. + if (bytesRead == 0 && totalBytesRead == 0) + { + return null; + } + + // If this read didn't produce any bytes, but a previous read did, we've hit the end of the file, so + // shrink the chunk down to the number of bytes we actually have. + if (bytesRead == 0) + { + var shrunkChunk = new byte[totalBytesRead]; + Buffer.BlockCopy(chunk, 0, shrunkChunk, 0, totalBytesRead); + return shrunkChunk; + } + } - var endIndex = (int)endIndexLong; - if (_isStreamFinished) - return endIndex < _streamLength; + return chunk; + } - var chunkIndex = endIndex / _chunkLength; + // GetChunk is substantially slower for random accesses owing to needing to use a Dictionary, rather than a + // List. However, the typical access pattern isn't very random at all -- you generally read a whole series of + // bytes from the same chunk. So we just cache the last chunk that was read and return that directly if it's + // requested again. This is about 15% faster than going straight to the Dictionary. + private int _lastChunkIdx = -1; + private byte[] _lastChunkData = null; - while (chunkIndex >= _chunks.Count) + /// + /// Load the data for the given chunk (if necessary), and return it. Chunks are identified by their index, + /// which is their start offset divided by the chunk length. eg: offset 10 will typically refer to chunk + /// index 0. See DoGetChunk() for implementation -- this function adds simple memoization. 
+ /// + /// The index of the chunk to get + private byte[] GetChunk(int chunkIndex) + { + if (chunkIndex == _lastChunkIdx) { - Debug.Assert(!_isStreamFinished); + return _lastChunkData; + } + + var result = DoGetChunk(chunkIndex); + _lastChunkIdx = chunkIndex; + _lastChunkData = result; - var chunk = new byte[_chunkLength]; - var totalBytesRead = 0; - while (!_isStreamFinished && totalBytesRead != _chunkLength) + return result; + } + + private byte[] DoGetChunk(int chunkIndex) + { + byte[] result; + if (_chunks.TryGetValue(chunkIndex, out result)) + { + return result; + } + + if (!_contiguousBufferMode) + { + var chunkStart = chunkIndex * _chunkLength; + + // Often we will be reading long contiguous blocks, even in non-contiguous mode. Don't issue Seeks in + // that case, so as to avoid unnecessary syscalls. + if (chunkStart != _stream.Position) + { + _stream.Seek(chunkStart, SeekOrigin.Begin); + } + + var nextChunk = LoadNextChunk(); + if (nextChunk != null) { - var bytesRead = _stream.Read(chunk, totalBytesRead, _chunkLength - totalBytesRead); + _chunks[chunkIndex] = nextChunk; + var newStreamLen = (chunkIndex * _chunkLength) + nextChunk.Length; + _streamLength = newStreamLen > _streamLength ? newStreamLen : _streamLength; + } - if (bytesRead == 0) + return nextChunk; + } + + byte[] curChunk = null; + while (_maxChunkLoaded < chunkIndex) + { + var curChunkIdx = _maxChunkLoaded + 1; + curChunk = LoadNextChunk(); + if (curChunk != null) + { + _chunks[curChunkIdx] = curChunk; + if(_streamLength < 0) { - // the stream has ended, which may be ok - _isStreamFinished = true; - _streamLength = _chunks.Count * _chunkLength + totalBytesRead; - // check we have enough bytes for the requested index - if (endIndex >= _streamLength) - { - _chunks.Add(chunk); - return false; - } + // If this is the first chunk we've loaded, then initialize the stream length. 
+ _streamLength = curChunk.Length; } else { - totalBytesRead += bytesRead; + var newStreamLen = _streamLength + curChunk.Length; + _streamLength = newStreamLen > _streamLength ? newStreamLen : _streamLength; } } + else + { + return null; + } + + _maxChunkLoaded = curChunkIdx; + } + + return curChunk; + } + + protected override bool IsValidIndex(int index, int bytesRequested) + { + if (index < 0 || bytesRequested < 0) + return false; + + var endIndexLong = (long)index + bytesRequested - 1; + if (endIndexLong > int.MaxValue) + return false; + + if (!_contiguousBufferMode) + { + return endIndexLong < _stream.Length; + } + + var endIndex = (int)endIndexLong; - _chunks.Add(chunk); + var chunkIndex = endIndex / _chunkLength; + + var endChunk = GetChunk(chunkIndex); + if (endChunk == null) + { + return false; } - return true; + return endChunk.Length > (endIndex % _chunkLength); } public override int ToUnshiftedOffset(int localOffset) => localOffset; @@ -166,7 +295,7 @@ public override byte GetByte(int index) var chunkIndex = index / _chunkLength; var innerIndex = index % _chunkLength; - var chunk = _chunks[chunkIndex]; + var chunk = GetChunk(chunkIndex); return chunk[innerIndex]; } @@ -183,7 +312,7 @@ public override byte[] GetBytes(int index, int count) var fromChunkIndex = fromIndex / _chunkLength; var fromInnerIndex = fromIndex % _chunkLength; var length = Math.Min(remaining, _chunkLength - fromInnerIndex); - var chunk = _chunks[fromChunkIndex]; + var chunk = GetChunk(fromChunkIndex); Array.Copy(chunk, fromInnerIndex, bytes, toIndex, length); remaining -= length; fromIndex += length;