Lucene.Net.Store (DataInput + DataOutput): Use stack or array pool for temporary byte buffers (#1207)

NightOwl888 · paulirwin · web-flow · commit 36f587de8481 · 2025-11-21T21:09:49.000-07:00
* PERFORMANCE: Lucene.Net.Store.DataInput::ReadString(): Use stack or array pool for buffer reuse

* Lucene.Net.Util.UnicodeUtil: Added overload of TryUTF16toUTF8 that accepts spans for both input and output. Also added a GetMaxByteCount() method to determine the amount of buffer to allocate.

* PERFORMANCE: Lucene.Net.Store.DataOutput::WriteString(): Added WriteChars() method that accepts a span and moved allocations to the stack or array pool for buffer reuse.

* Add unit tests

---------

Co-authored-by: Paul Irwin <paulirwin@gmail.com>
diff --git a/src/Lucene.Net.Tests/Store/TestByteArrayDataOutput.cs b/src/Lucene.Net.Tests/Store/TestByteArrayDataOutput.cs
@@ -0,0 +1,53 @@
+using System;
+using Lucene.Net.Attributes;
+using NUnit.Framework;
+using Assert = Lucene.Net.TestFramework.Assert;
+
+namespace Lucene.Net.Store
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
+
+    [TestFixture]
+    [LuceneNetSpecific]
+    public class TestByteArrayDataOutput : LuceneTestCase
+    {
+        [Test]
+        public virtual void TestWriteString()
+        {
+            byte[] bytes = new byte[10];
+            ByteArrayDataOutput @out = new ByteArrayDataOutput(bytes);
+            @out.WriteString("ABC");
+
+            ByteArrayDataInput @in = new ByteArrayDataInput(bytes);
+            Assert.AreEqual("ABC", @in.ReadString());
+        }
+
+        [Test]
+        public virtual void TestWriteChars()
+        {
+            byte[] bytes = new byte[10];
+            ByteArrayDataOutput @out = new ByteArrayDataOutput(bytes);
+            @out.WriteChars("ABC".AsSpan());
+
+            ByteArrayDataInput @in = new ByteArrayDataInput(bytes);
+            Assert.AreEqual("ABC", @in.ReadString());
+        }
+    }
+}
diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
@@ -348,7 +348,7 @@ public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow)
         }
 
         [Test]
-        [LuceneNetSpecific] // this is a Lucene.NET specific method
+        [LuceneNetSpecific]
         [Repeat(100)]
         public void TestTryUTF8toUTF16()
         {
@@ -362,7 +362,7 @@ public void TestTryUTF8toUTF16()
         }
 
         [Test]
-        [LuceneNetSpecific] // this is a Lucene.NET specific method
+        [LuceneNetSpecific]
         [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, "ca\ufffd")] // ca�, start of 2-byte sequence
         [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, "ca\ufffd")] // ca�, start of 3-byte sequence
         [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, "ca\ufffd")] // ca�, start of 4-byte sequence
@@ -375,5 +375,19 @@ public void TestUTF8toUTF16WithFallback(byte[] utf8, string expected)
 
             Assert.AreEqual(expected, scratch.ToString());
         }
+
+        [Test]
+        [LuceneNetSpecific]
+        [Repeat(100)]
+        public void TestTryUTF16toUTF8()
+        {
+            string unicode = TestUtil.RandomRealisticUnicodeString(Random);
+            var utf8 = new byte[IOUtils.ENCODING_UTF_8_NO_BOM.GetMaxByteCount(unicode.Length)];
+
+            bool success = UnicodeUtil.TryUTF16toUTF8(unicode.AsSpan(), utf8.AsSpan(), out int bytesWritten);
+
+            Assert.IsTrue(success);
+            Assert.AreEqual(unicode, IOUtils.ENCODING_UTF_8_NO_BOM.GetString(utf8, 0, bytesWritten));
+        }
     }
 }
diff --git a/src/Lucene.Net/Store/DataInput.cs b/src/Lucene.Net/Store/DataInput.cs
@@ -1,5 +1,10 @@
+using J2N.Numerics;
+using J2N.Text;
 using Lucene.Net.Diagnostics;
+using Lucene.Net.Support.Buffers;
+using Lucene.Net.Util;
 using System;
+using System.Buffers;
 using System.Collections.Generic;
 using System.IO;
 using System.Text;
@@ -274,16 +279,36 @@ public virtual long ReadVInt64()
             throw new IOException("Invalid VInt64 detected (negative values disallowed)");
         }
 
+#nullable enable
+
         /// <summary>
         /// Reads a <see cref="string"/>. </summary>
         /// <seealso cref="DataOutput.WriteString(string)"/>
         public virtual string ReadString()
         {
+            // LUCENENET: Use stack or array pool for the temporary array so we don't put pressure on the GC
             int length = ReadVInt32();
-            byte[] bytes = new byte[length];
-            ReadBytes(bytes, 0, length);
-
-            return Encoding.UTF8.GetString(bytes);
+            if (length == 0)
+            {
+                // Fast path - don't allocate if we don't need to
+                return string.Empty;
+            }
+            // Always use a power of 2 for the length
+            int bufferLength = BitOperation.IsPow2(length) ? length : BitOperation.RoundUpToPowerOf2(length);
+            byte[]? arrayToReturnToPool = null;
+            Span<byte> bytes = (bufferLength > Constants.MaxStackByteLimit
+                ? (arrayToReturnToPool = ArrayPool<byte>.Shared.Rent(bufferLength))
+                : stackalloc byte[bufferLength])
+                .Slice(0, length); // Slice to exact byte length
+            try
+            {
+                ReadBytes(bytes);
+                return Encoding.UTF8.GetString(bytes);
+            }
+            finally
+            {
+                ArrayPool<byte>.Shared.ReturnIfNotNull(arrayToReturnToPool);
+            }
         }
 
         /// <summary>
diff --git a/src/Lucene.Net/Store/DataOutput.cs b/src/Lucene.Net/Store/DataOutput.cs
@@ -260,12 +260,52 @@ public void WriteVInt64(long i)
         /// <seealso cref="DataInput.ReadString()"/>
         public virtual void WriteString(string s)
         {
-            var utf8Result = new BytesRef(10);
-            UnicodeUtil.UTF16toUTF8(s, 0, s.Length, utf8Result);
-            WriteVInt32(utf8Result.Length);
-            WriteBytes(utf8Result.Bytes, 0, utf8Result.Length);
+            if (s is null)
+                throw new ArgumentNullException(nameof(s));
+            WriteChars(s.AsSpan());
         }
 
+#nullable enable
+
+        /// <summary>
+        /// Writes a character span.
+        /// <para/>
+        /// Writes chars as UTF-8 encoded bytes. First the length, in bytes, is
+        /// written as a <see cref="WriteVInt32"/>, followed by the bytes.
+        /// </summary>
+        /// <param name="chars">The chars to write.</param>
+        // LUCENENET specific
+        public virtual void WriteChars(ReadOnlySpan<char> chars)
+        {
+            if (chars.IsEmpty)
+            {
+                // Fast path - don't allocate if we don't need to
+                WriteVInt32(0);
+                WriteBytes(Array.Empty<byte>());
+                return;
+            }
+
+            int bufferLength = UnicodeUtil.GetMaxByteCount(chars.Length);
+            byte[]? arrayToReturnToPool = null;
+            Span<byte> utf8Result = bufferLength > Constants.MaxStackByteLimit
+                ? (arrayToReturnToPool = ArrayPool<byte>.Shared.Rent(bufferLength))
+                : stackalloc byte[bufferLength];
+            try
+            {
+                // We are calculating the size up front, so this will always succeed.
+                bool success = UnicodeUtil.TryUTF16toUTF8(chars, utf8Result, out int bytesLength);
+                Debug.Assert(success, "There wasn't enough memory allocated for all of the bytes.");
+                WriteVInt32(bytesLength);
+                WriteBytes(utf8Result.Slice(0, bytesLength));
+            }
+            finally
+            {
+                ArrayPool<byte>.Shared.ReturnIfNotNull(arrayToReturnToPool);
+            }
+        }
+
+#nullable restore
+
         private const int COPY_BUFFER_SIZE = 16384;
         private byte[] copyBuffer;
 
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -127,6 +127,84 @@ public static partial class UnicodeUtil
         private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint -
                                              (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
 
+        /// <summary>
+        /// Encodes into a span of bytes a set of characters from the specified read-only span if the
+        /// <paramref name="destination"/> is large enough.
+        /// </summary>
+        /// <param name="source">The span containing the set of characters to encode.</param>
+        /// <param name="destination">The byte span to hold the encoded bytes.</param>
+        /// <param name="bytesWritten">Upon successful completion of the operation, the number of bytes
+        /// encoded into <paramref name="destination"/>; <c>0</c> when the method returns <c>false</c>.</param>
+        /// <returns><c>true</c> if all of the characters were encoded into the destination;
+        /// <c>false</c> if the destination was too small to contain all the encoded bytes.</returns>
+        /// <remarks>To estimate the number of bytes to allocate, use <see cref="GetMaxByteCount(int)"/>.
+        /// <para/>
+        /// A destination that is exactly as long as the encoded output is sufficient.</remarks>
+        // LUCENENET specific overload
+        public static bool TryUTF16toUTF8(ReadOnlySpan<char> source, Span<byte> destination, out int bytesWritten)
+        {
+            bytesWritten = 0;
+            int offset = 0;
+            int length = source.Length;
+            int end = offset + length;
+
+            int upto = 0;
+            for (int i = offset; i < end; i++)
+            {
+                var code = (int)source[i];
+
+                if (code < 0x80)
+                {
+                    if (upto + 1 > destination.Length) return false; // need room for 1 byte
+                    destination[upto++] = (byte)code;
+                }
+                else if (code < 0x800)
+                {
+                    if (upto + 2 > destination.Length) return false; // need room for 2 bytes
+                    destination[upto++] = (byte)(0xC0 | (code >> 6));
+                    destination[upto++] = (byte)(0x80 | (code & 0x3F));
+                }
+                else if (code < 0xD800 || code > 0xDFFF)
+                {
+                    if (upto + 3 > destination.Length) return false; // need room for 3 bytes
+                    destination[upto++] = (byte)(0xE0 | (code >> 12));
+                    destination[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+                    destination[upto++] = (byte)(0x80 | (code & 0x3F));
+                }
+                else
+                {
+                    // surrogate pair
+                    // confirm valid high surrogate
+                    if (code < 0xDC00 && (i < end - 1))
+                    {
+                        var utf32 = (int)source[i + 1];
+                        // confirm valid low surrogate and write pair
+                        if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
+                        {
+                            utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
+                            i++;
+                            if (upto + 4 > destination.Length) return false; // need room for 4 bytes
+                            destination[upto++] = (byte)(0xF0 | (utf32 >> 18));
+                            destination[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+                            destination[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+                            destination[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+                            continue;
+                        }
+                    }
+
+                    // replace unpaired surrogate or out-of-order low surrogate
+                    // with substitution character
+                    if (upto + 3 > destination.Length) return false; // U+FFFD encodes to 3 bytes
+                    destination[upto++] = 0xEF;
+                    destination[upto++] = 0xBF;
+                    destination[upto++] = 0xBD;
+                }
+            }
+
+            bytesWritten = upto;
+            return true;
+        }
+
         /// <summary>
         /// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
         /// and ending at <paramref name="result"/>. After encoding, <c>result.Offset</c> will always be 0.
@@ -620,6 +698,17 @@ public static bool ValidUTF16String(ReadOnlySpan<char> s, int size)
             return true;
         }
 
+        /// <summary>
+        /// Calculates the maximum number of bytes produced by UTF8 encoding the
+        /// specified number of characters.
+        /// </summary>
+        /// <param name="charCount">The number of characters to encode.</param>
+        /// <returns>The maximum number of bytes produced by encoding the specified
+        /// number of characters to UTF8.</returns>
+        /// <remarks>The return value is <c>charCount * 4</c>; it is a multiple of 4, not necessarily a power of 2.</remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int GetMaxByteCount(int charCount) => charCount * 4;
+
         // Borrowed from Python's 3.1.2 sources,
         // Objects/unicodeobject.c, and modified (see commented
         // out section, and the -1s) to disallow the reserved for

Original file line number	Diff line number	Diff line change
`@@ -348,7 +348,7 @@ public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow)`
`348`	`348`	`}`
`349`	`349`
`350`	`350`	`[Test]`
`351`		`- [LuceneNetSpecific] // this is a Lucene.NET specific method`
	`351`	`+ [LuceneNetSpecific]`
`352`	`352`	`[Repeat(100)]`
`353`	`353`	`public void TestTryUTF8toUTF16()`
`354`	`354`	`{`
`@@ -362,7 +362,7 @@ public void TestTryUTF8toUTF16()`
`362`	`362`	`}`
`363`	`363`
`364`	`364`	`[Test]`
`365`		`- [LuceneNetSpecific] // this is a Lucene.NET specific method`
	`365`	`+ [LuceneNetSpecific]`
`366`	`366`	`[TestCase(new byte[] { 0x63, 0x61, 0xc3 }, "ca\ufffd")] // ca�, start of 2-byte sequence`
`367`	`367`	`[TestCase(new byte[] { 0x63, 0x61, 0xe3 }, "ca\ufffd")] // ca�, start of 3-byte sequence`
`368`	`368`	`[TestCase(new byte[] { 0x63, 0x61, 0xf3 }, "ca\ufffd")] // ca�, start of 4-byte sequence`
`@@ -375,5 +375,19 @@ public void TestUTF8toUTF16WithFallback(byte[] utf8, string expected)`
`375`	`375`
`376`	`376`	`Assert.AreEqual(expected, scratch.ToString());`
`377`	`377`	`}`
	`378`	`+`
	`379`	`+ [Test]`
	`380`	`+ [LuceneNetSpecific]`
	`381`	`+ [Repeat(100)]`
	`382`	`+ public void TestTryUTF16toUTF8()`
	`383`	`+ {`
	`384`	`+ string unicode = TestUtil.RandomRealisticUnicodeString(Random);`
	`385`	`+ var utf8 = new byte[IOUtils.ENCODING_UTF_8_NO_BOM.GetMaxByteCount(unicode.Length)];`
	`386`	`+`
	`387`	`+ bool success = UnicodeUtil.TryUTF16toUTF8(unicode.AsSpan(), utf8.AsSpan(), out int bytesWritten);`
	`388`	`+`
	`389`	`+ Assert.IsTrue(success);`
	`390`	`+ Assert.AreEqual(unicode, IOUtils.ENCODING_UTF_8_NO_BOM.GetString(utf8, 0, bytesWritten));`
	`391`	`+ }`
`378`	`392`	`}`
`379`	`393`	`}`