diff --git a/MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs b/MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs
new file mode 100644
index 000000000..1178405a2
--- /dev/null
+++ b/MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs
@@ -0,0 +1,107 @@
+#region License
+//
+// Copyright 2002-2017 Drew Noakes
+// Ported from Java to C# by Yakov Danilov for Imazen LLC in 2014
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// More information about this project is available at:
+//
+// https://github.com/drewnoakes/metadata-extractor-dotnet
+// https://drewnoakes.com/code/exif/
+//
+#endregion
+
+using System;
+using System.IO;
+using MetadataExtractor.IO;
+using Xunit;
+
+namespace MetadataExtractor.Tests.IO
+{
+ /// <summary>
+ /// Wrapper for MemoryStream that allows us to configure whether it is
+ /// seekable, and whether or not the Length property is supported.
+ /// </summary>
+ public class ConfigurableMemoryStream : MemoryStream
+ {
+ private bool _seekable;
+ private bool _lengthSupported;
+
+ public ConfigurableMemoryStream(byte[] buffer, bool seekable, bool lengthSupported) : base(buffer)
+ {
+ _seekable = seekable;
+ _lengthSupported = lengthSupported;
+ }
+
+ public override long Length
+ {
+ get
+ {
+ if (_lengthSupported)
+ {
+ return base.Length;
+ }
+ else
+ {
+ throw new NotSupportedException("Length property was disabled");
+ }
+ }
+ }
+
+ public override bool CanSeek => _seekable;
+ }
+
+ /// <summary>Unit tests for <see cref="IndexedCapturingReader"/>.</summary>
+ /// <author>Drew Noakes https://drewnoakes.com</author>
+ public abstract class StreamingIndexedCapturingReaderTestBase : IndexedReaderTestBase
+ {
+ [Fact]
+ public void ConstructWithNullBufferThrows()
+ {
+ // ReSharper disable once AssignNullToNotNullAttribute
+ Assert.Throws<ArgumentNullException>(() => new IndexedCapturingReader(null));
+ }
+ }
+
+ // Since the normal IndexedCapturingReaderTest uses MemoryStream, which both
+ // supports the Length property and is seekable, the following classes test
+ // the remaining permutations of options:
+ //
+ // * non-seekable, has length
+ // * seekable, doesn't have length
+ // * non-seekable, doesn't have length
+ public sealed class NonSeekableLengthSupportedIndexedCapturingReaderTest : StreamingIndexedCapturingReaderTestBase
+ {
+ protected override IndexedReader CreateReader(params byte[] bytes)
+ {
+ return new IndexedCapturingReader(new ConfigurableMemoryStream(bytes, false, true));
+ }
+ }
+
+ public sealed class SeekableLengthUnsupportedIndexedCapturingReaderTest : StreamingIndexedCapturingReaderTestBase
+ {
+ protected override IndexedReader CreateReader(params byte[] bytes)
+ {
+ return new IndexedCapturingReader(new ConfigurableMemoryStream(bytes, true, false));
+ }
+ }
+
+ public sealed class NonSeekableLengthUnsupportedIndexedCapturingReaderTest : StreamingIndexedCapturingReaderTestBase
+ {
+ protected override IndexedReader CreateReader(params byte[] bytes)
+ {
+ return new IndexedCapturingReader(new ConfigurableMemoryStream(bytes, false, false));
+ }
+ }
+}
diff --git a/MetadataExtractor/IO/IndexedCapturingReader.cs b/MetadataExtractor/IO/IndexedCapturingReader.cs
index d517b5d1d..a48c85b26 100644
--- a/MetadataExtractor/IO/IndexedCapturingReader.cs
+++ b/MetadataExtractor/IO/IndexedCapturingReader.cs
@@ -33,15 +33,15 @@ namespace MetadataExtractor.IO
/// <author>Drew Noakes https://drewnoakes.com</author>
public sealed class IndexedCapturingReader : IndexedReader
{
- private const int DefaultChunkLength = 2 * 1024;
+ private const int DefaultChunkLength = 4 * 1024;
[NotNull]
private readonly Stream _stream;
private readonly int _chunkLength;
- private readonly List<byte[]> _chunks = new List<byte[]>();
- private bool _isStreamFinished;
- private int _streamLength;
- private bool _streamLengthThrewException;
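+ // Chunks are keyed by index rather than stored in a List so that, in non-contiguous buffer mode,
+ // they can be loaded sparsely and out of order.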
+ private readonly Dictionary<int, byte[]> _chunks;
+ private int _maxChunkLoaded = -1;
+ private int _streamLength = -1;
+ private readonly bool _contiguousBufferMode;
public IndexedCapturingReader([NotNull] Stream stream, int chunkLength = DefaultChunkLength, bool isMotorolaByteOrder = true)
: base(isMotorolaByteOrder)
@@ -51,6 +51,40 @@ public IndexedCapturingReader([NotNull] Stream stream, int chunkLength = Default
_chunkLength = chunkLength;
_stream = stream ?? throw new ArgumentNullException(nameof(stream));
+
+ try
+ {
+ // For some reason, FileStreams are faster in contiguous mode. Since this is such a common case, we
+ // specifically check for it.
+ if (_stream is FileStream)
+ {
+ _contiguousBufferMode = true;
+ }
+ else
+ {
+ // If the stream is both seekable and has a length, switch to non-contiguous buffering mode. This
+ // will use Seek operations to access data far beyond what has already been buffered, rather than
+ // reading the entire stream into memory.
+ _contiguousBufferMode = !(_stream.Length > 0 && _stream.CanSeek);
+ }
+ }
+ catch (NotSupportedException)
+ {
+ // Streams that don't support the Length property have to be handled in contiguous mode.
+ _contiguousBufferMode = true;
+ }
+
+ if (!_contiguousBufferMode)
+ {
+ // If we know the length of the stream ahead of time, we can allocate a Dictionary with enough slots
+ // for all the chunks. We 2X it to try to avoid hash collisions.
+ var chunksCapacity = 2 * (_stream.Length / chunkLength);
+ _chunks = new Dictionary<int, byte[]>((int)chunksCapacity);
+ }
+ else
+ {
+ _chunks = new Dictionary<int, byte[]>();
+ }
}
///
@@ -68,21 +102,16 @@ public override long Length
{
get
{
- if (!_streamLengthThrewException)
+ if (_contiguousBufferMode)
{
- try
+ if (_streamLength == -1)
{
- return _stream.Length;
- }
- catch (NotSupportedException)
- {
- _streamLengthThrewException = true;
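+ // In contiguous mode the length isn't known until the stream has been fully read; probing an
+ // out-of-range index forces that read and sets _streamLength as a side effect.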
+ IsValidIndex(int.MaxValue, 1);
}
+ return _streamLength;
}
- IsValidIndex(int.MaxValue, 1);
- Debug.Assert(_isStreamFinished);
- return _streamLength;
+ return _stream.Length;
}
}
@@ -103,59 +132,159 @@ protected override void ValidateIndex(int index, int bytesRequested)
if ((long)index + bytesRequested - 1 > int.MaxValue)
throw new BufferBoundsException($"Number of requested bytes summed with starting index exceed maximum range of signed 32 bit integers (requested index: {index}, requested count: {bytesRequested})");
- Debug.Assert(_isStreamFinished);
// TODO test that can continue using an instance of this type after this exception
- throw new BufferBoundsException(ToUnshiftedOffset(index), bytesRequested, _streamLength);
+ throw new BufferBoundsException(ToUnshiftedOffset(index), bytesRequested, Length);
}
}
- protected override bool IsValidIndex(int index, int bytesRequested)
+ /// <summary>
+ /// Helper method for GetChunk. This will load the next chunk of data from the input stream. If non-contiguous
+ /// buffering mode is being used, this method relies on the caller (GetChunk) to set the stream's position
+ /// correctly. In contiguous buffer mode, this will simply be the next chunk in sequence (the stream's Position
+ /// property will just advance monotonically).
+ /// </summary>
+ /// <returns>The next chunk of data, or null if the end of the stream was reached before any bytes were read</returns>
+ private byte[] LoadNextChunk()
{
- if (index < 0 || bytesRequested < 0)
- return false;
+ var chunk = new byte[_chunkLength];
+ var totalBytesRead = 0;
- var endIndexLong = (long)index + bytesRequested - 1;
- if (endIndexLong > int.MaxValue)
- return false;
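+ // Stream.Read may return fewer bytes than requested, so keep reading until the chunk is full or
+ // the stream is exhausted.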
+ while (totalBytesRead != _chunkLength)
+ {
+ var bytesRead = _stream.Read(chunk, totalBytesRead, _chunkLength - totalBytesRead);
+ totalBytesRead += bytesRead;
+
+ // If no bytes were read at all, we've reached the end of the stream.
+ if (bytesRead == 0 && totalBytesRead == 0)
+ {
+ return null;
+ }
+
+ // If this read didn't produce any bytes, but a previous read did, we've hit the end of the file, so
+ // shrink the chunk down to the number of bytes we actually have.
+ if (bytesRead == 0)
+ {
+ var shrunkChunk = new byte[totalBytesRead];
+ Buffer.BlockCopy(chunk, 0, shrunkChunk, 0, totalBytesRead);
+ return shrunkChunk;
+ }
+ }
- var endIndex = (int)endIndexLong;
- if (_isStreamFinished)
- return endIndex < _streamLength;
+ return chunk;
+ }
- var chunkIndex = endIndex / _chunkLength;
+ // GetChunk is substantially slower for random accesses because it uses a Dictionary rather than a
+ // List. However, the typical access pattern isn't very random at all -- you generally read a whole series of
+ // bytes from the same chunk. So we just cache the last chunk that was read and return it directly if it's
+ // requested again. This is about 15% faster than going straight to the Dictionary.
+ private int _lastChunkIdx = -1;
+ private byte[] _lastChunkData = null;
- while (chunkIndex >= _chunks.Count)
+ /// <summary>
+ /// Load the data for the given chunk (if necessary), and return it. Chunks are identified by their index,
+ /// which is their start offset divided by the chunk length. e.g. offset 10 will typically refer to chunk
+ /// index 0. See DoGetChunk() for the implementation -- this function adds simple memoization.
+ /// </summary>
+ /// <param name="chunkIndex">The index of the chunk to get</param>
+ private byte[] GetChunk(int chunkIndex)
+ {
+ if (chunkIndex == _lastChunkIdx)
{
- Debug.Assert(!_isStreamFinished);
+ return _lastChunkData;
+ }
+
+ var result = DoGetChunk(chunkIndex);
+ _lastChunkIdx = chunkIndex;
+ _lastChunkData = result;
- var chunk = new byte[_chunkLength];
- var totalBytesRead = 0;
- while (!_isStreamFinished && totalBytesRead != _chunkLength)
+ return result;
+ }
+
+ private byte[] DoGetChunk(int chunkIndex)
+ {
+ byte[] result;
+ if (_chunks.TryGetValue(chunkIndex, out result))
+ {
+ return result;
+ }
+
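+ // Cache miss: the chunk must be read from the stream. In non-contiguous mode we can seek straight
+ // to it; in contiguous mode we have to read forward through every chunk up to the requested one.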
+ if (!_contiguousBufferMode)
+ {
+ var chunkStart = chunkIndex * _chunkLength;
+
+ // Often we will be reading long contiguous blocks, even in non-contiguous mode. Don't issue Seeks in
+ // that case, so as to avoid unnecessary syscalls.
+ if (chunkStart != _stream.Position)
+ {
+ _stream.Seek(chunkStart, SeekOrigin.Begin);
+ }
+
+ var nextChunk = LoadNextChunk();
+ if (nextChunk != null)
{
- var bytesRead = _stream.Read(chunk, totalBytesRead, _chunkLength - totalBytesRead);
+ _chunks[chunkIndex] = nextChunk;
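+ // Chunks may be loaded out of order in this mode, so only ever grow the recorded stream length.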
+ var newStreamLen = (chunkIndex * _chunkLength) + nextChunk.Length;
+ _streamLength = newStreamLen > _streamLength ? newStreamLen : _streamLength;
+ }
- if (bytesRead == 0)
+ return nextChunk;
+ }
+
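+ // Contiguous mode: read chunks sequentially from the last one loaded up to the requested index,
+ // caching each chunk and extending the known stream length as we go.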
+ byte[] curChunk = null;
+ while (_maxChunkLoaded < chunkIndex)
+ {
+ var curChunkIdx = _maxChunkLoaded + 1;
+ curChunk = LoadNextChunk();
+ if (curChunk != null)
+ {
+ _chunks[curChunkIdx] = curChunk;
+ if (_streamLength < 0)
{
- // the stream has ended, which may be ok
- _isStreamFinished = true;
- _streamLength = _chunks.Count * _chunkLength + totalBytesRead;
- // check we have enough bytes for the requested index
- if (endIndex >= _streamLength)
- {
- _chunks.Add(chunk);
- return false;
- }
+ // If this is the first chunk we've loaded, then initialize the stream length.
+ _streamLength = curChunk.Length;
}
else
{
- totalBytesRead += bytesRead;
+ var newStreamLen = _streamLength + curChunk.Length;
+ _streamLength = newStreamLen > _streamLength ? newStreamLen : _streamLength;
}
}
+ else
+ {
+ return null;
+ }
+
+ _maxChunkLoaded = curChunkIdx;
+ }
+
+ return curChunk;
+ }
+
+ protected override bool IsValidIndex(int index, int bytesRequested)
+ {
+ if (index < 0 || bytesRequested < 0)
+ return false;
+
+ var endIndexLong = (long)index + bytesRequested - 1;
+ if (endIndexLong > int.MaxValue)
+ return false;
+
+ if (!_contiguousBufferMode)
+ {
+ return endIndexLong < _stream.Length;
+ }
+
+ var endIndex = (int)endIndexLong;
- _chunks.Add(chunk);
+ var chunkIndex = endIndex / _chunkLength;
+
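+ // Load the chunk containing the final requested byte; if it comes back null or short, the request
+ // extends past the end of the stream.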
+ var endChunk = GetChunk(chunkIndex);
+ if (endChunk == null)
+ {
+ return false;
}
- return true;
+ return endChunk.Length > (endIndex % _chunkLength);
}
public override int ToUnshiftedOffset(int localOffset) => localOffset;
@@ -166,7 +295,7 @@ public override byte GetByte(int index)
var chunkIndex = index / _chunkLength;
var innerIndex = index % _chunkLength;
- var chunk = _chunks[chunkIndex];
+ var chunk = GetChunk(chunkIndex);
return chunk[innerIndex];
}
@@ -183,7 +312,7 @@ public override byte[] GetBytes(int index, int count)
var fromChunkIndex = fromIndex / _chunkLength;
var fromInnerIndex = fromIndex % _chunkLength;
var length = Math.Min(remaining, _chunkLength - fromInnerIndex);
- var chunk = _chunks[fromChunkIndex];
+ var chunk = GetChunk(fromChunkIndex);
Array.Copy(chunk, fromInnerIndex, bytes, toIndex, length);
remaining -= length;
fromIndex += length;