From 0681bcb8024cb696e30c807070d734fdda7decc2 Mon Sep 17 00:00:00 2001 From: yash-puligundla Date: Mon, 27 Sep 2021 15:54:51 -0400 Subject: [PATCH 1/3] handle utf-8 encoding --- .../htsjdk/samtools/util/AsciiWriter.java | 18 ++-- .../htsjdk/samtools/SAMFileRoundTripTest.java | 94 +++++++++++++++++++ .../resources/htsjdk/samtools/roundtrip.sam | 4 +- .../htsjdk/samtools/roundtrip_with_utf8.sam | 18 ++++ .../samtools/roundtrip_with_utf8_bad_1.sam | 18 ++++ .../samtools/roundtrip_with_utf8_bad_2.sam | 18 ++++ .../samtools/roundtrip_with_utf8_bad_3.sam | 18 ++++ 7 files changed, 175 insertions(+), 13 deletions(-) create mode 100644 src/test/java/htsjdk/samtools/SAMFileRoundTripTest.java create mode 100644 src/test/resources/htsjdk/samtools/roundtrip_with_utf8.sam create mode 100644 src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_1.sam create mode 100644 src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_2.sam create mode 100644 src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_3.sam diff --git a/src/main/java/htsjdk/samtools/util/AsciiWriter.java b/src/main/java/htsjdk/samtools/util/AsciiWriter.java index 50b08d8443..fe3fda7aac 100644 --- a/src/main/java/htsjdk/samtools/util/AsciiWriter.java +++ b/src/main/java/htsjdk/samtools/util/AsciiWriter.java @@ -28,6 +28,7 @@ import java.io.IOException; import java.io.OutputStream; import java.io.Writer; +import java.nio.charset.*; /** * Fast (I hope) buffered Writer that converts char to byte merely by casting, rather than charset conversion. @@ -70,17 +71,10 @@ public void flush() throws IOException { * All other Writer methods vector through this, so this is the only one that must be overridden. */ @Override - public void write(final char[] chars, int offset, int length) throws IOException { - while (length > 0) { - final int charsToConvert = Math.min(length, buffer.length - numBytes); - StringUtil.charsToBytes(chars, offset, charsToConvert, buffer, numBytes); - numBytes += charsToConvert; - offset += charsToConvert; - length -= charsToConvert; - if (numBytes == buffer.length) { - os.write(buffer, 0, numBytes); - numBytes = 0; - } - } + public void write(final char[] chars, int off, int len) throws IOException { + String str = new String(chars,off,len); + byte[] b = str.getBytes(StandardCharsets.UTF_8); + int bufferLength = b.length; + os.write(b, 0, bufferLength); } } diff --git a/src/test/java/htsjdk/samtools/SAMFileRoundTripTest.java b/src/test/java/htsjdk/samtools/SAMFileRoundTripTest.java new file mode 100644 index 0000000000..f70f377bfc --- /dev/null +++ b/src/test/java/htsjdk/samtools/SAMFileRoundTripTest.java @@ -0,0 +1,94 @@ +package htsjdk.samtools; + +import htsjdk.*; +import htsjdk.samtools.util.*; +import org.testng.*; +import org.testng.annotations.*; +import java.io.*; + +public class SAMFileRoundTripTest extends HtsjdkTest{ + private static final File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools"); + + @DataProvider(name = "Utf8PositiveTestCases") + public Object[][] Utf8PositiveTestCases() { + SAMProgramRecord programRecordRoundTrip = new SAMProgramRecord("33"); + programRecordRoundTrip.setAttribute("CL","xy"); + programRecordRoundTrip.setAttribute("DS","description"); + + SAMProgramRecord programRecordRoundTripUtf8 = new SAMProgramRecord("33"); + programRecordRoundTripUtf8.setAttribute("CL","äカ"); + programRecordRoundTripUtf8.setAttribute("DS","\uD83D\uDE00リ"); + + SAMSequenceRecord sequenceRecordRoundTrip = new SAMSequenceRecord("chr3",101); + sequenceRecordRoundTrip.setAttribute("DS","descriptionhere"); + sequenceRecordRoundTrip.setSequenceIndex(2); + + SAMSequenceRecord sequenceRecordRoundTripUtf8 = new SAMSequenceRecord("chr3",101); + sequenceRecordRoundTripUtf8.setAttribute("DS","Emoji\uD83D\uDE0A"); + sequenceRecordRoundTripUtf8.setSequenceIndex(2); + + return new Object[][]{ + {"roundtrip.sam", programRecordRoundTrip, "@CO\tcomment here", sequenceRecordRoundTrip}, + {"roundtrip_with_utf8.sam", programRecordRoundTripUtf8, "@CO\tKanjiアメリカ\uD83D\uDE00リä", sequenceRecordRoundTripUtf8} + }; + } + + @Test(dataProvider = "Utf8PositiveTestCases", description = "Test UTF-8 encoding present in permitted fields of a SAM file") + public void Utf8RoundTripPositiveTests(final String inputFile, SAMProgramRecord programRecord,final String commentRecord, SAMSequenceRecord sequenceRecord) throws Exception { + final File input = new File(TEST_DATA_DIR, inputFile); + final File outputFile = File.createTempFile("roundtrip-utf8-out", ".sam"); + outputFile.delete(); + outputFile.deleteOnExit(); + final SAMFileWriterFactory factory = new SAMFileWriterFactory(); + try (SamReader reader = SamReaderFactory.makeDefault().open(input); + SAMFileWriter writer = factory.makeSAMWriter(reader.getFileHeader(), false, new FileOutputStream(outputFile))) { + for (SAMRecord rec : reader) { + writer.addAlignment(rec); + } + SAMFileHeader head = reader.getFileHeader(); + Assert.assertEquals(head.getProgramRecords().get(0), programRecord); + Assert.assertEquals(head.getComments().get(0),commentRecord ); + Assert.assertEquals(head.getSequence("chr3"),sequenceRecord); + } + + final String originalsam; + try (InputStream is = new FileInputStream(input)) { + originalsam = IOUtil.readFully(is); + } + + final String writtenSam; + try (InputStream is = new FileInputStream(outputFile)) { + writtenSam = IOUtil.readFully(is); + } + + Assert.assertEquals(writtenSam, originalsam); + } + + @DataProvider(name = "Utf8NegativeTestCases") + public Object[][] Utf8NegativeTestCases() { + return new Object[][]{ + {"roundtrip_with_utf8_bad_1.sam", "Invalid character in read bases"}, + {"roundtrip_with_utf8_bad_2.sam", "Non-numeric value in POS column"}, + {"roundtrip_with_utf8_bad_2.sam", "Non-numeric value in POS column"} + }; + } + + @Test(dataProvider = "Utf8NegativeTestCases",description = "Test UTF-8 encoding present in unpermitted fields of a SAM file", expectedExceptions = {IllegalArgumentException.class, SAMFormatException.class }) + public void Utf8RoundTripNegativeTest(final String inputFile,final String exceptionString) throws Exception { + final File input = new File(TEST_DATA_DIR, inputFile); + final File outputFile = File.createTempFile("roundtrip-utf8-out", ".sam"); + outputFile.delete(); + outputFile.deleteOnExit(); + final SAMFileWriterFactory factory = new SAMFileWriterFactory(); + try (SamReader reader = SamReaderFactory.makeDefault().open(input); + SAMFileWriter writer = factory.makeSAMWriter(reader.getFileHeader(), false, new FileOutputStream(outputFile))) { + for (SAMRecord rec : reader) { + writer.addAlignment(rec); + } + } + catch (final Exception ex) { + Assert.assertTrue(ex.getMessage().contains(exceptionString)); + throw ex; + } + } +} \ No newline at end of file diff --git a/src/test/resources/htsjdk/samtools/roundtrip.sam b/src/test/resources/htsjdk/samtools/roundtrip.sam index bc90a50f23..1d31b4db25 100644 --- a/src/test/resources/htsjdk/samtools/roundtrip.sam +++ b/src/test/resources/htsjdk/samtools/roundtrip.sam @@ -1,9 +1,11 @@ @HD VN:1.6 SO:unsorted @SQ SN:chr1 LN:101 @SQ SN:chr2 LN:101 -@SQ SN:chr3 LN:101 +@SQ SN:chr3 LN:101 DS:descriptionhere @RG ID:0 SM:Hi,Mom! @RG ID:rg1 PL:ILLUMINA SM:sm1 +@PG ID:33 CL:xy DS:description +@CO comment here A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 A 133 * 0 0 * chr2 1 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 diff --git a/src/test/resources/htsjdk/samtools/roundtrip_with_utf8.sam b/src/test/resources/htsjdk/samtools/roundtrip_with_utf8.sam new file mode 100644 index 0000000000..268aff0fb1 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/roundtrip_with_utf8.sam @@ -0,0 +1,18 @@ +@HD VN:1.6 SO:unsorted +@SQ SN:chr1 LN:101 +@SQ SN:chr2 LN:101 +@SQ SN:chr3 LN:101 DS:Emoji😊 +@RG ID:0 SM:Hi,Mom! DS:Kanjiアメリカ +@RG ID:rg1 PL:ILLUMINA SM:sm1 +@PG ID:33 CL:äカ DS:😀リ +@CO Kanjiアメリカ😀リä +A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +A 133 * 0 0 * chr2 1 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +B 147 chr1 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +C 99 chr2 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +C 147 chr2 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +D 99 chr3 1 255 10M = 25 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +D 147 chr3 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +E 99 chr1 2 255 10M = 15 30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +E 147 chr1 15 255 10M = 2 -30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 \ No newline at end of file diff --git a/src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_1.sam b/src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_1.sam new file mode 100644 index 0000000000..70b52a9013 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_1.sam @@ -0,0 +1,18 @@ +@HD VN:1.6 SO:unsorted +@SQ SN:chr1 LN:101 +@SQ SN:chr2 LN:101 +@SQ SN:chr3 LN:101 DS:Emoji😊 +@RG ID:0 SM:Hi,Mom! DS:Kanjiアメリカ +@RG ID:rg1 PL:ILLUMINA SM:sm1 +@PG ID:33 CL:äカ DS:😀リ +@CO Kanjiアメリカ😀リä +A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +A 133 * 0 0 * chr2 1 0 CAリCAGAAGC )'.*.+2,)) RG:Z:rg1 +B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +B 147 chr1 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +C 99 chr2 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +C 147 chr1 😀6 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +D 99 chr3 1 255 10M = 25 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +D 147 chr3 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +E 99 chr1 2 255 10M = 15 30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +E 147 chr1 15 255 10M = 2 -30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 diff --git a/src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_2.sam b/src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_2.sam new file mode 100644 index 0000000000..b75402fb23 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_2.sam @@ -0,0 +1,18 @@ +@HD VN:1.6 SO:unsorted +@SQ SN:chr1 LN:101 +@SQ SN:chr2 LN:101 +@SQ SN:chr3 LN:101 DS:Emoji😊 +@RG ID:0 SM:Hi,Mom! DS:Kanjiアメリカ +@RG ID:rg1 PL:ILLUMINA SM:sm1 +@PG ID:33 CL:äカ DS:😀リ +@CO Kanjiアメリカ😀リä +A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +A 133 * 0 0 * chr2 1 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +B 147 chr1 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +C 99 chr2 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +C 147 chr1 😀6 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +D 99 chr3 1 255 10M = 25 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +D 147 chr3 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +E 99 chr1 2 255 10M = 15 30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +E 147 chr1 15 255 10M = 2 -30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 diff --git a/src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_3.sam b/src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_3.sam new file mode 100644 index 0000000000..9a64c14bc8 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_3.sam @@ -0,0 +1,18 @@ +@HD VN:1.6 SO:unsorted +@SQ SN:chr1 LN:101 +@SQ SN:chr2 LN:101 +@SQ SN:chr3 LN:101 DS:Emoji😊 +@RG ID:0 SM:Hi,Mom! DS:Kanjiアメリカ +@RG ID:rg1 PL:ILLUMINA SM:sm1 +@PG ID:33 CL:äカ DS:😀リ +@CO Kanjiアメリカ😀リä +A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +A 133 * 0 0 * chr2 1 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) R😀:Z:rg1 +B 147 chr1 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +C 99 chr2 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +C 147 chr1 😀6 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +D 99 chr3 1 255 10M = 25 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +D 147 chr3 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +E 99 chr1 2 255 10M = 15 30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 +E 147 chr1 15 255 10M = 2 -30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1 From 747dbae4eb219f9fc481800b950f9410c604e970 Mon Sep 17 00:00:00 2001 From: yash-puligundla Date: Mon, 4 Oct 2021 16:01:40 -0400 Subject: [PATCH 2/3] test --- src/main/java/htsjdk/samtools/util/AsciiWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/htsjdk/samtools/util/AsciiWriter.java b/src/main/java/htsjdk/samtools/util/AsciiWriter.java index fe3fda7aac..c0bfb932e4 100644 --- a/src/main/java/htsjdk/samtools/util/AsciiWriter.java +++ b/src/main/java/htsjdk/samtools/util/AsciiWriter.java @@ -73,7 +73,7 @@ public void flush() throws IOException { @Override public void write(final char[] chars, int off, int len) throws IOException { String str = new String(chars,off,len); - byte[] b = str.getBytes(StandardCharsets.UTF_8); + byte[] b = str.getBytes(StandardCharsets.UTF_8);// int bufferLength = b.length; os.write(b, 0, bufferLength); } From c619ba692f08d9be1693d932bdce3d41aa9cd8a4 Mon Sep 17 00:00:00 2001 From: yash-puligundla Date: Mon, 4 Oct 2021 16:01:58 -0400 Subject: [PATCH 3/3] revert --- src/main/java/htsjdk/samtools/util/AsciiWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/htsjdk/samtools/util/AsciiWriter.java b/src/main/java/htsjdk/samtools/util/AsciiWriter.java index c0bfb932e4..fe3fda7aac 100644 --- a/src/main/java/htsjdk/samtools/util/AsciiWriter.java +++ b/src/main/java/htsjdk/samtools/util/AsciiWriter.java @@ -73,7 +73,7 @@ public void flush() throws IOException { @Override public void write(final char[] chars, int off, int len) throws IOException { String str = new String(chars,off,len); - byte[] b = str.getBytes(StandardCharsets.UTF_8);// + byte[] b = str.getBytes(StandardCharsets.UTF_8); int bufferLength = b.length; os.write(b, 0, bufferLength); }