Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using Lucene.Net.Attributes;
using NUnit.Framework;
using System;
using System.Text;
using Assert = Lucene.Net.TestFramework.Assert;

namespace Lucene.Net.Util
Expand Down Expand Up @@ -339,7 +340,7 @@ public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow)

if (shouldThrow)
{
Assert.Throws<FormatException>(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch));
Assert.Throws<DecoderFallbackException>(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch));
}
else
{
Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net/Util/BytesRef.cs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ namespace Lucene.Net.Util
[Serializable]
#endif
// LUCENENET specific: Not implementing ICloneable per Microsoft's recommendation
[DebuggerDisplay("{ToString()} {Utf8ToString()}")]
[DebuggerDisplay("{ToString()} {Utf8ToStringWithFallback()}")]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I originally meant by "fallback" was the same logic we use for the formatter:

if (bytesRef.TryUtf8ToString(out var utf8String))
{
return utf8String;
}
else
{
return bytesRef.ToString();
}

That is, when there is invalid UTF-8 in the BytesRef, it would fall back to ToString(), which shows the raw bytes. This would give the developer a clear indication that the bytes were invalid. It also seems backwards to display the bytes before the string. The string is the important bit, so it should go first.

IMO, whatever the solution, there should be a very clear indication that the bytes are invalid. Showing the fallback characters looks a bit strange, but it is hard to tell when developing whether they are actually coming from the code you are debugging, or the debugger itself.

So I propose:

  1. Show the string at the beginning and replace it with the text "Invalid UTF-8" if it is invalid.
  2. Show the bytes afterward.
  3. Ideally, show each on separate lines (which requires writing a small custom visualizer).

A custom visualizer for this might look like:

using System;
using System.Diagnostics;
using System.Text;
using Microsoft.VisualStudio.DebuggerVisualizers;

/// <summary>
/// Sketch of a debugger visualizer for <c>BytesRef</c> that shows the decoded
/// UTF-8 content (or the text "Invalid UTF-8" when decoding fails) on one line
/// and the raw bytes on the next.
/// </summary>
public class MyClassVisualizer : DialogDebuggerVisualizer
{
    /// <summary>
    /// Builds the two-line description of the visualized <c>BytesRef</c> and displays it.
    /// </summary>
    /// <param name="windowService">Dialog service supplied by the debugger (not needed for a plain message box).</param>
    /// <param name="objectProvider">Provides the object currently being visualized.</param>
    protected override void Show(IDialogVisualizerService windowService, IVisualizerObjectProvider objectProvider)
    {
        // Get the object being visualized; bail out quietly if it is not a BytesRef.
        BytesRef bytesRef = objectProvider.GetObject() as BytesRef;
        if (bytesRef is null)
        {
            return;
        }

        // Prepare the display content (multi-line): string first, raw bytes second.
        StringBuilder sb = new StringBuilder();
        sb.Append("Content = ");
        if (bytesRef.TryUtf8ToString(out var utf8String))
        {
            sb.AppendLine(utf8String);
        }
        else
        {
            // Make invalid input unmistakable instead of showing replacement characters.
            sb.AppendLine("Invalid UTF-8");
        }
        sb.Append("Bytes = ");
        sb.AppendLine(bytesRef.ToString());

        // IDialogVisualizerService.ShowDialog only accepts a Form, Control or CommonDialog,
        // so plain text is shown with a message box instead (fully qualified to avoid
        // requiring an extra using directive).
        System.Windows.Forms.MessageBox.Show(sb.ToString());
    }
}

public sealed class BytesRef : IComparable<BytesRef>, IComparable, IEquatable<BytesRef> // LUCENENET specific - implemented IComparable for FieldComparator, IEquatable<BytesRef>
{
/// <summary>
Expand Down
10 changes: 5 additions & 5 deletions src/Lucene.Net/Util/UnicodeUtil.cs
Original file line number Diff line number Diff line change
Expand Up @@ -886,7 +886,7 @@ public static string ToHexString(string s)
/// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
/// <para/>
/// NOTE: Full characters are read, even if this reads past the length passed (and
/// can result in an <see cref="FormatException"/> if invalid UTF-8 is passed).
/// can result in a <see cref="DecoderFallbackException"/> if invalid UTF-8 is passed).
/// Explicit checks for valid UTF-8 are not performed.
/// </summary>
/// <seealso cref="UTF8toUTF16(ReadOnlySpan{byte}, CharsRef)"/>
Expand All @@ -901,7 +901,7 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha
/// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
/// <para/>
/// NOTE: Full characters are read, even if this reads past the length passed (and
/// can result in an <see cref="FormatException"/> if invalid UTF-8 is passed).
/// can result in a <see cref="DecoderFallbackException"/> if invalid UTF-8 is passed).
/// Explicit checks for valid UTF-8 are not performed.
/// </summary>
/// <remarks>
Expand All @@ -926,15 +926,15 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, CharsRef chars)
{
if (utf8.Length <= i)
{
throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}");
throw new DecoderFallbackException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}");
}
@out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
}
else if (b < 0xf0)
{
if (utf8.Length <= i + 1)
{
throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}");
throw new DecoderFallbackException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}");
}
@out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
i += 2;
Expand All @@ -943,7 +943,7 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, CharsRef chars)
{
if (utf8.Length <= i + 2)
{
throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}");
throw new DecoderFallbackException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}");
}
if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);
Expand Down
Loading