From 01c6a2fb26c5d946f3b1a4e12beca59f65fb18f3 Mon Sep 17 00:00:00 2001 From: James Gregory Date: Wed, 7 Feb 2018 11:32:47 -0800 Subject: [PATCH 1/5] Query underlying Stream object for Length, but fallback to original mechanism if the stream doesn't support the Length operation. Addresses https://github.com/drewnoakes/metadata-extractor-dotnet/issues/122 by allowing for streamed parsing of metadata. --- MetadataExtractor/IO/IndexedCapturingReader.cs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/MetadataExtractor/IO/IndexedCapturingReader.cs b/MetadataExtractor/IO/IndexedCapturingReader.cs index b319d5ede..be58c5af5 100644 --- a/MetadataExtractor/IO/IndexedCapturingReader.cs +++ b/MetadataExtractor/IO/IndexedCapturingReader.cs @@ -60,9 +60,16 @@ public override long Length { get { - IsValidIndex(int.MaxValue, 1); - Debug.Assert(_isStreamFinished); - return _streamLength; + try + { + return _stream.Length; + } + catch (NotSupportedException) + { + IsValidIndex(int.MaxValue, 1); + Debug.Assert(_isStreamFinished); + return _streamLength; + } } } From b6b61ff3d2b8c13ff0d0e6bac30933c53628167e Mon Sep 17 00:00:00 2001 From: James Gregory Date: Wed, 7 Feb 2018 14:50:09 -0800 Subject: [PATCH 2/5] Cache status of Length being supported on the file stream, per Drew's comment. --- .../IO/IndexedCapturingReader.cs | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/MetadataExtractor/IO/IndexedCapturingReader.cs b/MetadataExtractor/IO/IndexedCapturingReader.cs index be58c5af5..e2b940b35 100644 --- a/MetadataExtractor/IO/IndexedCapturingReader.cs +++ b/MetadataExtractor/IO/IndexedCapturingReader.cs @@ -41,6 +41,7 @@ public sealed class IndexedCapturingReader : IndexedReader private readonly List _chunks = new List(); private bool _isStreamFinished; private int _streamLength; + private bool _streamLengthThrewException = false; public IndexedCapturingReader([NotNull] Stream stream, int chunkLength = DefaultChunkLength, bool isMotorolaByteOrder = true) : base(isMotorolaByteOrder) @@ -60,16 +61,21 @@ public override long Length { get { - try + if (!_streamLengthThrewException) { - return _stream.Length; - } - catch (NotSupportedException) - { - IsValidIndex(int.MaxValue, 1); - Debug.Assert(_isStreamFinished); - return _streamLength; + try + { + return _stream.Length; + } + catch (NotSupportedException) + { + _streamLengthThrewException = true; + } } + + IsValidIndex(int.MaxValue, 1); + Debug.Assert(_isStreamFinished); + return _streamLength; } } From 6f7e2f0dc7c265efe60df3b4846e3b476f970b00 Mon Sep 17 00:00:00 2001 From: James Gregory Date: Thu, 8 Feb 2018 15:24:51 -0800 Subject: [PATCH 3/5] Initial version of IndexedCapturingReader that won't read the entire file unless it's needed. 
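Before the larger rework in patch 3, the pattern that patches 1 and 2 establish is worth seeing in isolation: ask the underlying Stream for its Length once, and remember when that throws so the read-to-end fallback is used from then on. The sketch below is illustrative only; the LengthProbe type and its method are not part of the patch series.

    using System;
    using System.IO;

    // Illustrative sketch of the probe-and-cache pattern from patches 1 and 2.
    // None of these names appear in the patches themselves.
    sealed class LengthProbe
    {
        private bool _lengthUnsupported;

        // Returns the stream's length when the stream can report it, otherwise null.
        // The cost of the NotSupportedException is only ever paid once.
        public long? TryGetLength(Stream stream)
        {
            if (_lengthUnsupported)
                return null;

            try
            {
                return stream.Length;
            }
            catch (NotSupportedException)
            {
                // Corresponds to the _streamLengthThrewException flag added in patch 2.
                _lengthUnsupported = true;
                return null;
            }
        }
    }

Callers that get null back fall back to reading the stream to its end, exactly as the original Length getter did.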
--- .../IO/IndexedCapturingReader.cs | 222 ++++++++++++---- 1 file changed, 171 insertions(+), 51 deletions(-) diff --git a/MetadataExtractor/IO/IndexedCapturingReader.cs b/MetadataExtractor/IO/IndexedCapturingReader.cs index e2b940b35..386c6976c 100644 --- a/MetadataExtractor/IO/IndexedCapturingReader.cs +++ b/MetadataExtractor/IO/IndexedCapturingReader.cs @@ -33,15 +33,15 @@ namespace MetadataExtractor.IO /// Drew Noakes https://drewnoakes.com public sealed class IndexedCapturingReader : IndexedReader { - private const int DefaultChunkLength = 2 * 1024; + private const int DefaultChunkLength = 4 * 1024; [NotNull] private readonly Stream _stream; private readonly int _chunkLength; - private readonly List<byte[]> _chunks = new List<byte[]>(); - private bool _isStreamFinished; - private int _streamLength; - private bool _streamLengthThrewException = false; + private readonly Dictionary<int, byte[]> _chunks; + private int _maxChunkLoaded = -1; + private int _streamLength = -1; + private readonly bool _contiguousBufferMode; public IndexedCapturingReader([NotNull] Stream stream, int chunkLength = DefaultChunkLength, bool isMotorolaByteOrder = true) : base(isMotorolaByteOrder) { @@ -51,6 +51,40 @@ public IndexedCapturingReader([NotNull] Stream stream, int chunkLength = Default _chunkLength = chunkLength; _stream = stream ?? throw new ArgumentNullException(nameof(stream)); + + try + { + // For some reason, FileStreams are faster in contiguous mode. Since this is such a common case, we + // specifically check for it. + if (_stream is FileStream) + { + _contiguousBufferMode = true; + } + else + { + // If the stream is both seekable and has a length, switch to non-contiguous buffering mode. This + // will use Seek operations to access data that is far beyond the reach of what has been buffered, + // rather than reading the entire file into memory in this case. + _contiguousBufferMode = !(_stream.Length > 0 && _stream.CanSeek); + } + } + catch (NotSupportedException) + { + // Streams that don't support the Length property have to be handled in contiguous mode. + _contiguousBufferMode = true; + } + + if (!_contiguousBufferMode) + { + // If we know the length of the stream ahead of time, we can allocate a Dictionary with enough slots + // for all the chunks. We 2X it to try to avoid hash collisions. + var chunksCapacity = 2 * (_stream.Length / chunkLength); + _chunks = new Dictionary<int, byte[]>((int) chunksCapacity); + } + else + { + _chunks = new Dictionary<int, byte[]>(); + } } /// Reads to the end of the stream, in order to determine the total number of bytes.
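The heart of the constructor change above is the choice between the two buffering modes. Read in isolation, the decision is roughly the following; this is a standalone sketch rather than the patch's exact code, and the BufferingMode/UseContiguous names are invented for illustration.

    using System;
    using System.IO;

    static class BufferingMode
    {
        // true  -> buffer chunks sequentially as the stream is read (contiguous mode)
        // false -> seek to whichever chunk is requested (non-contiguous mode)
        public static bool UseContiguous(Stream stream)
        {
            // FileStream is special-cased: per the comment in the patch, contiguous mode
            // measured faster for files even though they could be read non-contiguously.
            if (stream is FileStream)
                return true;

            try
            {
                // A seekable stream with a known, positive Length can be read on demand.
                return !(stream.Length > 0 && stream.CanSeek);
            }
            catch (NotSupportedException)
            {
                // No Length available, so the data has to be buffered as it arrives.
                return true;
            }
        }
    }

Only in non-contiguous mode does the reader pre-size its chunk dictionary, using twice the expected chunk count to keep hash collisions down.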
@@ -61,21 +95,16 @@ public override long Length { get { - if (!_streamLengthThrewException) + if (_contiguousBufferMode) { - try + if (_streamLength != -1) { - return _stream.Length; - } - catch (NotSupportedException) - { - _streamLengthThrewException = true; + IsValidIndex(int.MaxValue, 1); } + return _streamLength; } - - IsValidIndex(int.MaxValue, 1); - Debug.Assert(_isStreamFinished); - return _streamLength; + + return _stream.Length; } } @@ -96,12 +125,126 @@ protected override void ValidateIndex(int index, int bytesRequested) if ((long)index + bytesRequested - 1 > int.MaxValue) throw new BufferBoundsException($"Number of requested bytes summed with starting index exceed maximum range of signed 32 bit integers (requested index: {index}, requested count: {bytesRequested})"); - Debug.Assert(_isStreamFinished); // TODO test that can continue using an instance of this type after this exception throw new BufferBoundsException(ToUnshiftedOffset(index), bytesRequested, _streamLength); } } + /// + /// Helper method for GetChunk. This will load the next chunk of data from the input stream. If non-contiguous + /// buffering mode is being used, this method relies on the caller (GetChunk) to set the stream's position + /// correctly. In contiguous buffer mode, this will simply be the next chunk in sequence (the stream's Position + /// field will just advance monotonically). + /// + /// + private byte[] LoadNextChunk() + { + var chunk = new byte[_chunkLength]; + var totalBytesRead = 0; + + while (totalBytesRead != _chunkLength) + { + var bytesRead = _stream.Read(chunk, totalBytesRead, _chunkLength - totalBytesRead); + totalBytesRead += bytesRead; + + // if no bytes were read at all, we've reached the end of the file. + if (bytesRead == 0 && totalBytesRead == 0) + { + return null; + } + + // If this read didn't produce any bytes, but a previous read did, we've hit the end of the file, so + // shrink the chunk down to the number of bytes we actually have. + if (bytesRead == 0) + { + var shrunkChunk = new byte[totalBytesRead]; + Buffer.BlockCopy(chunk, 0, shrunkChunk, 0, totalBytesRead); + return shrunkChunk; + } + } + + return chunk; + } + + // GetChunk is substantially slower for random accesses owing to needing to use a Dictionary, rather than a + // List. However, the typical access pattern isn't very random at all -- you generally read a whole series of + // bytes from the same chunk. So we just cache the last chunk that was read and return that directly if it's + // requested again. This is about 15% faster than going straight to the Dictionary. + private int _lastChunkIdx = -1; + private byte[] _lastChunkData = null; + + /// + /// Load the data for the given chunk (if necessary), and return it. Chunks are identified by their index, + /// which is their start offset divided by the chunk length. e.g. offset 10 will typically refer to chunk + /// index 0. See DoGetChunk() for implementation -- this function adds simple memoization.
+ /// + /// The index of the chunk to get + private byte[] GetChunk(int chunkIndex) + { + if (chunkIndex == _lastChunkIdx) + { + return _lastChunkData; + } + + var result = DoGetChunk(chunkIndex); + _lastChunkIdx = chunkIndex; + _lastChunkData = result; + + return result; + } + + private byte[] DoGetChunk(int chunkIndex) + { + byte[] result; + if (_chunks.TryGetValue(chunkIndex, out result)) + { + return result; + } + + if (!_contiguousBufferMode) + { + var chunkStart = chunkIndex * _chunkLength; + + // Often we will be reading long contiguous blocks, even in non-contiguous mode. Don't issue Seeks in + // that case, so as to avoid unnecessary syscalls. + if (chunkStart != _stream.Position) + { + _stream.Seek(chunkStart, SeekOrigin.Begin); + } + + var nextChunk = LoadNextChunk(); + if (nextChunk != null) + { + _chunks[chunkIndex] = nextChunk; + var newStreamLen = (chunkIndex * _chunkLength) + nextChunk.Length; + _streamLength = newStreamLen > _streamLength ? newStreamLen : _streamLength; + } + + return nextChunk; + } + + byte[] curChunk = null; + while (_maxChunkLoaded < chunkIndex) + { + var curChunkIdx = _maxChunkLoaded + 1; + curChunk = LoadNextChunk(); + if (curChunk != null) + { + _chunks[curChunkIdx] = curChunk; + var newStreamLen = (curChunkIdx * _chunkLength) + curChunk.Length; + _streamLength = newStreamLen > _streamLength ? newStreamLen : _streamLength; + } + else + { + return null; + } + + _maxChunkLoaded = curChunkIdx; + } + + return curChunk; + } + protected override bool IsValidIndex(int index, int bytesRequested) { if (index < 0 || bytesRequested < 0) @@ -111,45 +254,22 @@ protected override bool IsValidIndex(int index, int bytesRequested) if (endIndexLong > int.MaxValue) return false; + if (!_contiguousBufferMode) + { + return endIndexLong < _stream.Length; + } + var endIndex = (int)endIndexLong; - if (_isStreamFinished) - return endIndex < _streamLength; var chunkIndex = endIndex / _chunkLength; - // TODO test loading several chunks for a single request - while (chunkIndex >= _chunks.Count) + var endChunk = GetChunk(chunkIndex); + if (endChunk == null) { - Debug.Assert(!_isStreamFinished); - - var chunk = new byte[_chunkLength]; - var totalBytesRead = 0; - while (!_isStreamFinished && totalBytesRead != _chunkLength) - { - var bytesRead = _stream.Read(chunk, totalBytesRead, _chunkLength - totalBytesRead); - - if (bytesRead == 0) - { - // the stream has ended, which may be ok - _isStreamFinished = true; - _streamLength = _chunks.Count * _chunkLength + totalBytesRead; - // check we have enough bytes for the requested index - if (endIndex >= _streamLength) - { - _chunks.Add(chunk); - return false; - } - } - else - { - totalBytesRead += bytesRead; - } - } - - _chunks.Add(chunk); + return false; } - return true; + return endChunk.Length > (endIndex % _chunkLength); } public override int ToUnshiftedOffset(int localOffset) => localOffset; @@ -160,7 +280,7 @@ public override byte GetByte(int index) var chunkIndex = index / _chunkLength; var innerIndex = index % _chunkLength; - var chunk = _chunks[chunkIndex]; + var chunk = GetChunk(chunkIndex); return chunk[innerIndex]; } @@ -177,7 +297,7 @@ public override byte[] GetBytes(int index, int count) var fromChunkIndex = fromIndex / _chunkLength; var fromInnerIndex = fromIndex % _chunkLength; var length = Math.Min(remaining, _chunkLength - fromInnerIndex); - var chunk = _chunks[fromChunkIndex]; + var chunk = GetChunk(fromChunkIndex); Array.Copy(chunk, fromInnerIndex, bytes, toIndex, length); remaining -= length; fromIndex += length; From 
6160e1fbd212612d6718a332b8efcdab04e63867 Mon Sep 17 00:00:00 2001 From: James Gregory Date: Tue, 10 Apr 2018 13:11:37 -0700 Subject: [PATCH 4/5] Add test cases for the streamable IO described in #122. This repeats the IndexedCapturingReaderTests, for each of the permutations of seekable and length supported. --- .../IO/StreamingIndexedCapturingReaderTest.cs | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs diff --git a/MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs b/MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs new file mode 100644 index 000000000..1178405a2 --- /dev/null +++ b/MetadataExtractor.Tests/IO/StreamingIndexedCapturingReaderTest.cs @@ -0,0 +1,107 @@ +#region License +// +// Copyright 2002-2017 Drew Noakes +// Ported from Java to C# by Yakov Danilov for Imazen LLC in 2014 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// More information about this project is available at: +// +// https://github.com/drewnoakes/metadata-extractor-dotnet +// https://drewnoakes.com/code/exif/ +// +#endregion + +using System; +using System.IO; +using MetadataExtractor.IO; +using Xunit; + +namespace MetadataExtractor.Tests.IO +{ + /// + /// Wrapper for MemoryStream that allows us to configure whether it is + /// seekable, and whether or not the Length property is supported. + /// + public class ConfigurableMemoryStream : MemoryStream + { + private bool _seekable; + private bool _lengthSupported; + + public ConfigurableMemoryStream(byte[] buffer, bool seekable, bool lengthSupported) : base(buffer) + { + _seekable = seekable; + _lengthSupported = lengthSupported; + } + + public override long Length + { + get + { + if (_lengthSupported) + { + return base.Length; + } + else + { + throw new NotSupportedException("Length property was disabled"); + } + } + } + + public override bool CanSeek => _seekable; + } + + /// Unit tests for . 
+ /// Drew Noakes https://drewnoakes.com + public abstract class StreamingIndexedCapturingReaderTestBase : IndexedReaderTestBase + { + [Fact] + public void ConstructWithNullBufferThrows() + { + // ReSharper disable once AssignNullToNotNullAttribute + Assert.Throws(() => new IndexedCapturingReader(null)); + } + } + + // Since the normal IndexedCapturingReaderTest uses MemoryStream, which both + // supports the Length property and is seekable, the following classes test + // the remaining permutations of options: + // + // * non-seekable, has length + // * seekable, doesn't have length + // * non-seekable, doesn't have length + public sealed class NonSeekableLengthSupportedIndexedCapturingReaderTest : StreamingIndexedCapturingReaderTestBase + { + protected override IndexedReader CreateReader(params byte[] bytes) + { + return new IndexedCapturingReader(new ConfigurableMemoryStream(bytes, false, true)); + } + } + + public sealed class SeekableLengthUnsupportedIndexedCapturingReaderTest : StreamingIndexedCapturingReaderTestBase + { + protected override IndexedReader CreateReader(params byte[] bytes) + { + return new IndexedCapturingReader(new ConfigurableMemoryStream(bytes, true, false)); + } + } + + public sealed class NonSeekableLengthUnsupportedIndexedCapturingReaderTest : StreamingIndexedCapturingReaderTestBase + { + protected override IndexedReader CreateReader(params byte[] bytes) + { + return new IndexedCapturingReader(new ConfigurableMemoryStream(bytes, false, false)); + } + } +} From 626355f62a894bed8b2dbfa1c228c28bb3ed0371 Mon Sep 17 00:00:00 2001 From: James Gregory Date: Tue, 10 Apr 2018 13:12:33 -0700 Subject: [PATCH 5/5] Fixes a couple of issues with length handling that were revealed by the test cases. --- MetadataExtractor/IO/IndexedCapturingReader.cs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/MetadataExtractor/IO/IndexedCapturingReader.cs b/MetadataExtractor/IO/IndexedCapturingReader.cs index a284210dd..a48c85b26 100644 --- a/MetadataExtractor/IO/IndexedCapturingReader.cs +++ b/MetadataExtractor/IO/IndexedCapturingReader.cs @@ -104,7 +104,7 @@ public override long Length { if (_contiguousBufferMode) { - if (_streamLength != -1) + if (_streamLength == -1) { IsValidIndex(int.MaxValue, 1); } @@ -133,7 +133,7 @@ protected override void ValidateIndex(int index, int bytesRequested) throw new BufferBoundsException($"Number of requested bytes summed with starting index exceed maximum range of signed 32 bit integers (requested index: {index}, requested count: {bytesRequested})"); // TODO test that can continue using an instance of this type after this exception - throw new BufferBoundsException(ToUnshiftedOffset(index), bytesRequested, _streamLength); + throw new BufferBoundsException(ToUnshiftedOffset(index), bytesRequested, Length); } } @@ -238,8 +238,16 @@ private byte[] DoGetChunk(int chunkIndex) if (curChunk != null) { _chunks[curChunkIdx] = curChunk; - var newStreamLen = (curChunkIdx * _chunkLength) + curChunk.Length; - _streamLength = newStreamLen > _streamLength ? newStreamLen : _streamLength; + if(_streamLength < 0) + { + // If this is the first chunk we've loaded, then initialize the stream length. + _streamLength = curChunk.Length; + } + else + { + var newStreamLen = _streamLength + curChunk.Length; + _streamLength = newStreamLen > _streamLength ? newStreamLen : _streamLength; + } } else {
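To make the permutations above concrete, here is an assumed usage sketch (the body of a hypothetical xUnit test, not code from the patch series) exercising the reader over a stream that neither seeks nor reports a Length, the case that forces contiguous buffering:

    // ConfigurableMemoryStream comes from patch 4; the assertions are xUnit's.
    var payload = new byte[] { 0, 1, 2, 3, 4, 5, 6, 7 };
    var stream = new ConfigurableMemoryStream(payload, seekable: false, lengthSupported: false);
    var reader = new IndexedCapturingReader(stream);

    // Only the chunks needed to satisfy the request are pulled from the stream.
    Assert.Equal((byte)3, reader.GetByte(3));

    // The stream cannot report a Length itself, so the reader's own bookkeeping answers
    // (all 8 bytes here).
    Assert.Equal(8L, reader.Length);

Patch 5 touches exactly this path: the Length getter's sentinel check is corrected to == -1, and in contiguous mode the length is now accumulated from the sizes of the chunks actually read, with the first loaded chunk initialising it.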