Skip to content

Commit 36f587d

Browse files
Lucene.Net.Store (DataInput + DataOutput): Use stack or array pool for temporary byte buffers (#1207)
* PERFORMANCE: Lucene.Net.Store.DataInput::ReadString(): Use stack or array pool for buffer reuse * Lucene.Net.Util.UnicodeUtil: Added overload of TryUTF16toUTF8 that accepts spans for both input and output. Also added a GetMaxByteCount() method to determine the amount of buffer to allocate. * PERFORMANCE: Lucene.Net.Store.DataOutput::WriteString(): Added WriteChars() method that accepts a span and moved allocations to the stack or array pool for buffer reuse. * Add unit tests --------- Co-authored-by: Paul Irwin <[email protected]>
1 parent c711b30 commit 36f587d

File tree

5 files changed

+231
-10
lines changed

5 files changed

+231
-10
lines changed
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
using System;
2+
using Lucene.Net.Attributes;
3+
using NUnit.Framework;
4+
using Assert = Lucene.Net.TestFramework.Assert;
5+
6+
namespace Lucene.Net.Store
7+
{
8+
/*
9+
* Licensed to the Apache Software Foundation (ASF) under one or more
10+
* contributor license agreements. See the NOTICE file distributed with
11+
* this work for additional information regarding copyright ownership.
12+
* The ASF licenses this file to You under the Apache License, Version 2.0
13+
* (the "License"); you may not use this file except in compliance with
14+
* the License. You may obtain a copy of the License at
15+
*
16+
* http://www.apache.org/licenses/LICENSE-2.0
17+
*
18+
* Unless required by applicable law or agreed to in writing, software
19+
* distributed under the License is distributed on an "AS IS" BASIS,
20+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21+
* See the License for the specific language governing permissions and
22+
* limitations under the License.
23+
*/
24+
25+
using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
26+
27+
/// <summary>
/// Round-trip tests for the string-writing members of <see cref="ByteArrayDataOutput"/>:
/// a value is written into a byte array and read back through a
/// <see cref="ByteArrayDataInput"/> over the same array.
/// </summary>
[TestFixture]
[LuceneNetSpecific]
public class TestByteArrayDataOutput : LuceneTestCase
{
    /// <summary>
    /// A string written with <c>WriteString</c> is returned unchanged by <c>ReadString</c>.
    /// </summary>
    [Test]
    public virtual void TestWriteString()
    {
        const string expected = "ABC";
        var buffer = new byte[10];

        var output = new ByteArrayDataOutput(buffer);
        output.WriteString(expected);

        var input = new ByteArrayDataInput(buffer);
        Assert.AreEqual(expected, input.ReadString());
    }

    /// <summary>
    /// A character span written with <c>WriteChars</c> is returned as the equivalent
    /// string by <c>ReadString</c>.
    /// </summary>
    [Test]
    public virtual void TestWriteChars()
    {
        const string expected = "ABC";
        var buffer = new byte[10];

        var output = new ByteArrayDataOutput(buffer);
        output.WriteChars(expected.AsSpan());

        var input = new ByteArrayDataInput(buffer);
        Assert.AreEqual(expected, input.ReadString());
    }
}
53+
}

src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow)
348348
}
349349

350350
[Test]
351-
[LuceneNetSpecific] // this is a Lucene.NET specific method
351+
[LuceneNetSpecific]
352352
[Repeat(100)]
353353
public void TestTryUTF8toUTF16()
354354
{
@@ -362,7 +362,7 @@ public void TestTryUTF8toUTF16()
362362
}
363363

364364
[Test]
365-
[LuceneNetSpecific] // this is a Lucene.NET specific method
365+
[LuceneNetSpecific]
366366
[TestCase(new byte[] { 0x63, 0x61, 0xc3 }, "ca\ufffd")] // ca�, start of 2-byte sequence
367367
[TestCase(new byte[] { 0x63, 0x61, 0xe3 }, "ca\ufffd")] // ca�, start of 3-byte sequence
368368
[TestCase(new byte[] { 0x63, 0x61, 0xf3 }, "ca\ufffd")] // ca�, start of 4-byte sequence
@@ -375,5 +375,19 @@ public void TestUTF8toUTF16WithFallback(byte[] utf8, string expected)
375375

376376
Assert.AreEqual(expected, scratch.ToString());
377377
}
378+
379+
/// <summary>
/// Encodes a random string with <see cref="UnicodeUtil.TryUTF16toUTF8(ReadOnlySpan{char}, Span{byte}, out int)"/>
/// and verifies that decoding the written bytes yields the original string.
/// </summary>
[Test]
[LuceneNetSpecific]
[Repeat(100)]
public void TestTryUTF16toUTF8()
{
    string text = TestUtil.RandomRealisticUnicodeString(Random);

    // Size the destination from the encoding's own worst-case estimate for this char count.
    int capacity = IOUtils.ENCODING_UTF_8_NO_BOM.GetMaxByteCount(text.Length);
    byte[] buffer = new byte[capacity];

    bool encoded = UnicodeUtil.TryUTF16toUTF8(text.AsSpan(), buffer.AsSpan(), out int written);
    Assert.IsTrue(encoded);

    // Only the first 'written' bytes are part of the encoded result.
    string roundTripped = IOUtils.ENCODING_UTF_8_NO_BOM.GetString(buffer, 0, written);
    Assert.AreEqual(text, roundTripped);
}
378392
}
379393
}

src/Lucene.Net/Store/DataInput.cs

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
1+
using J2N.Numerics;
2+
using J2N.Text;
13
using Lucene.Net.Diagnostics;
4+
using Lucene.Net.Support.Buffers;
5+
using Lucene.Net.Util;
26
using System;
7+
using System.Buffers;
38
using System.Collections.Generic;
49
using System.IO;
510
using System.Text;
@@ -274,16 +279,36 @@ public virtual long ReadVInt64()
274279
throw new IOException("Invalid VInt64 detected (negative values disallowed)");
275280
}
276281

282+
#nullable enable
283+
277284
/// <summary>
/// Reads a <see cref="string"/>. The byte length is read first via <see cref="ReadVInt32()"/>,
/// then that many bytes are read and decoded as UTF-8.
/// </summary>
/// <returns>The decoded string, or <see cref="string.Empty"/> when the stored length is 0.</returns>
/// <seealso cref="DataOutput.WriteString(string)"/>
public virtual string ReadString()
{
    int byteCount = ReadVInt32();
    if (byteCount == 0)
    {
        // Fast path - nothing to read, so no buffer is needed
        return string.Empty;
    }

    // LUCENENET: The temporary buffer lives on the stack or comes from the array pool
    // so we don't put pressure on the GC. Its capacity is always a power of 2.
    int capacity = BitOperation.IsPow2(byteCount)
        ? byteCount
        : BitOperation.RoundUpToPowerOf2(byteCount);

    byte[]? rented = null;
    Span<byte> scratch = capacity > Constants.MaxStackByteLimit
        ? (rented = ArrayPool<byte>.Shared.Rent(capacity))
        : stackalloc byte[capacity];

    // Only the first byteCount bytes carry the encoded string
    Span<byte> payload = scratch.Slice(0, byteCount);
    try
    {
        ReadBytes(payload);
        return Encoding.UTF8.GetString(payload);
    }
    finally
    {
        // No-op when the stack buffer was used
        ArrayPool<byte>.Shared.ReturnIfNotNull(rented);
    }
}
288313

289314
/// <summary>

src/Lucene.Net/Store/DataOutput.cs

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -260,12 +260,52 @@ public void WriteVInt64(long i)
260260
/// <seealso cref="DataInput.ReadString()"/>
261261
public virtual void WriteString(string s)
{
    // Reject null explicitly; the span-based overload cannot tell null apart from empty.
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // All encoding and writing is delegated to the span-based overload.
    ReadOnlySpan<char> chars = s.AsSpan();
    WriteChars(chars);
}
268267

268+
#nullable enable
269+
270+
/// <summary>
/// Writes a character span.
/// <para/>
/// The chars are encoded as UTF-8. The encoded length, in bytes, is written first
/// as a <see cref="WriteVInt32"/>, and the encoded bytes follow.
/// </summary>
/// <param name="chars">The chars to write.</param>
// LUCENENET specific
public virtual void WriteChars(ReadOnlySpan<char> chars)
{
    if (chars.Length == 0)
    {
        // Fast path - a zero length followed by no bytes; no buffer is needed
        WriteVInt32(0);
        WriteBytes(Array.Empty<byte>());
        return;
    }

    // Worst-case encoded size; small buffers go on the stack, larger ones come from the pool.
    int capacity = UnicodeUtil.GetMaxByteCount(chars.Length);
    byte[]? rented = null;
    Span<byte> scratch = capacity > Constants.MaxStackByteLimit
        ? (rented = ArrayPool<byte>.Shared.Rent(capacity))
        : stackalloc byte[capacity];
    try
    {
        // The buffer was sized up front from the char count, so encoding is expected to succeed.
        bool encoded = UnicodeUtil.TryUTF16toUTF8(chars, scratch, out int byteCount);
        Debug.Assert(encoded, "There wasn't enough memory allocated for all of the bytes.");

        WriteVInt32(byteCount);
        WriteBytes(scratch.Slice(0, byteCount));
    }
    finally
    {
        // No-op when the stack buffer was used
        ArrayPool<byte>.Shared.ReturnIfNotNull(rented);
    }
}
306+
307+
#nullable restore
308+
269309
private const int COPY_BUFFER_SIZE = 16384;
270310
private byte[] copyBuffer;
271311

src/Lucene.Net/Util/UnicodeUtil.cs

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,84 @@ public static partial class UnicodeUtil
127127
private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint -
128128
(UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
129129

130+
/// <summary>
/// Encodes into a span of bytes a set of characters from the specified read-only span if the
/// <paramref name="destination"/> is large enough.
/// </summary>
/// <param name="source">The span containing the set of characters to encode.</param>
/// <param name="destination">The byte span to hold the encoded bytes. A destination whose length
/// is exactly the encoded length is sufficient.</param>
/// <param name="bytesWritten">Upon successful completion of the operation, the number of bytes
/// encoded into <paramref name="destination"/>. When the method returns <c>false</c>, this is 0.</param>
/// <returns><c>true</c> if all of the characters were encoded into the destination;
/// <c>false</c> if the destination was too small to contain all the encoded bytes.</returns>
/// <remarks>To estimate the number of bytes to allocate, use <see cref="GetMaxByteCount(int)"/>.
/// <para/>
/// Unpaired or out-of-order surrogates are encoded as the replacement character U+FFFD.
/// <para/>
/// When this method returns <c>false</c>, <paramref name="destination"/> may already contain
/// the bytes of the characters that did fit.
/// <para/>
/// This is similar to <c>Encoding.UTF8.TryGetBytes()</c>.</remarks>
// LUCENENET specific overload
public static bool TryUTF16toUTF8(ReadOnlySpan<char> source, Span<byte> destination, out int bytesWritten)
{
    bytesWritten = 0;
    int end = source.Length;
    int upto = 0;

    for (int i = 0; i < end; i++)
    {
        var code = (int)source[i];

        if (code < 0x80)
        {
            // 1 byte: fails only when fewer than 1 byte remains
            if (upto + 1 > destination.Length) return false;
            destination[upto++] = (byte)code;
        }
        else if (code < 0x800)
        {
            // 2 bytes
            if (upto + 2 > destination.Length) return false;
            destination[upto++] = (byte)(0xC0 | (code >> 6));
            destination[upto++] = (byte)(0x80 | (code & 0x3F));
        }
        else if (code < 0xD800 || code > 0xDFFF)
        {
            // 3 bytes: BMP char outside the surrogate range
            if (upto + 3 > destination.Length) return false;
            destination[upto++] = (byte)(0xE0 | (code >> 12));
            destination[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
            destination[upto++] = (byte)(0x80 | (code & 0x3F));
        }
        else
        {
            // surrogate pair
            // confirm valid high surrogate
            if (code < 0xDC00 && (i < end - 1))
            {
                var utf32 = (int)source[i + 1];
                // confirm valid low surrogate and write pair
                if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
                {
                    utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
                    i++; // the low surrogate has been consumed
                    if (upto + 4 > destination.Length) return false;
                    destination[upto++] = (byte)(0xF0 | (utf32 >> 18));
                    destination[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
                    destination[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
                    destination[upto++] = (byte)(0x80 | (utf32 & 0x3F));
                    continue;
                }
            }

            // replace unpaired surrogate or out-of-order low surrogate
            // with substitution character (U+FFFD encoded as EF BF BD)
            if (upto + 3 > destination.Length) return false;
            destination[upto++] = 0xEF;
            destination[upto++] = 0xBF;
            destination[upto++] = 0xBD;
        }
    }

    bytesWritten = upto;
    return true;
}
207+
130208
/// <summary>
131209
/// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
132210
/// and ending at <paramref name="result"/>. After encoding, <c>result.Offset</c> will always be 0.
@@ -620,6 +698,17 @@ public static bool ValidUTF16String(ReadOnlySpan<char> s, int size)
620698
return true;
621699
}
622700

701+
/// <summary>
/// Calculates the maximum number of bytes produced by UTF8 encoding the
/// specified number of characters.
/// </summary>
/// <param name="charCount">The number of characters to encode.</param>
/// <returns>The maximum number of bytes produced by encoding the specified
/// number of characters to UTF8.</returns>
/// <remarks>The return value is <paramref name="charCount"/> multiplied by 4, so it is
/// always a multiple of 4 (but not necessarily a power of 2).</remarks>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="charCount"/> is negative,
/// or the resulting byte count would exceed <see cref="int.MaxValue"/>.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int GetMaxByteCount(int charCount)
{
    // The unsigned comparison rejects negative values and values whose product would overflow
    // in a single check, instead of silently returning a wrapped (too small or negative) size.
    if ((uint)charCount > int.MaxValue / 4)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount),
            "The character count must be non-negative and small enough that the maximum byte count fits in a 32-bit integer.");
    }

    return charCount * 4;
}
711+
623712
// Borrowed from Python's 3.1.2 sources,
624713
// Objects/unicodeobject.c, and modified (see commented
625714
// out section, and the -1s) to disallow the reserved for

0 commit comments

Comments
 (0)