Skip to content

Commit

Permalink
improve BED support per hts-specs version 1.0 specification
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh committed Jul 9, 2024
1 parent 82985d4 commit 55f4fd0
Show file tree
Hide file tree
Showing 8 changed files with 410 additions and 70 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,12 +177,12 @@ to boolean true or false against a record, provided in the context as variable `
with `dsh-filter-bed`, to filter BED records by chromosome and score

```javascript
r.getChrom() == 1 && r.getScore() > 10.0
r.getChrom() == 1 && r.getScore() > 10
```
specified on the command line as

```bash
$ dsh-bio filter-bed -i input.bed --script "r.getChrom() == 1 && r.getScore() > 10.0"
$ dsh-bio filter-bed -i input.bed --script "r.getChrom() == 1 && r.getScore() > 10"
```


Expand Down
186 changes: 123 additions & 63 deletions feature/src/main/java/org/dishevelled/bio/feature/bed/BedRecord.java

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,16 @@ public void testValueOfThickStartLessThanZero() {
valueOf("chr1\t11873\t14409\tuc001aaa.3\t0\t+\t-1\t11873\t0\t3\t354,109,1189,\t0,739,1347,");
}

/**
 * A thickStart of 10873 lies before the record start of 11873, so parsing
 * is expected to fail with an IllegalArgumentException.
 */
@Test(expected=IllegalArgumentException.class)
public void testValueOfThickStartLessThanStart() {
    final String thickStartBeforeStart = "chr1\t11873\t14409\tuc001aaa.3\t0\t+\t10873\t11873\t0\t3\t354,109,1189,\t0,739,1347,";
    valueOf(thickStartBeforeStart);
}

/**
 * A thickStart of 15409 lies past the record end of 14409, so parsing
 * is expected to fail with an IllegalArgumentException.
 */
@Test(expected=IllegalArgumentException.class)
public void testValueOfThickStartGreaterThanEnd() {
    final String thickStartAfterEnd = "chr1\t11873\t14409\tuc001aaa.3\t0\t+\t15409\t11873\t0\t3\t354,109,1189,\t0,739,1347,";
    valueOf(thickStartAfterEnd);
}

@Test(expected=IllegalArgumentException.class)
public void testValueOfThickEndLessThanZero() {
valueOf("chr1\t11873\t14409\tuc001aaa.3\t0\t+\t11873\t-1\t0\t3\t354,109,1189,\t0,739,1347,");
Expand Down Expand Up @@ -120,6 +130,51 @@ public void testValueOfTooManyBlockStarts() {
valueOf("chr1\t11873\t14409\tuc001aaa.3\t0\t+\t11873\t11873\t0\t3\t354,109,1189,\t0,739,1347,42,");
}

/**
 * A score column of -1 is expected to be rejected with an
 * IllegalArgumentException.
 */
@Test(expected=IllegalArgumentException.class)
public void testScoreTooLow() {
    final String negativeScore = "chr1\t11873\t14409\tuc001aaa.3\t-1\t+\t11873\t11873\t0\t3\t354,109,1189,\t0,739,1347,";
    valueOf(negativeScore);
}

/**
 * A score column of 1001 is expected to be rejected with an
 * IllegalArgumentException.
 */
@Test(expected=IllegalArgumentException.class)
public void testScoreTooHigh() {
    final String oversizedScore = "chr1\t11873\t14409\tuc001aaa.3\t1001\t+\t11873\t11873\t0\t3\t354,109,1189,\t0,739,1347,";
    valueOf(oversizedScore);
}

/**
 * A non-numeric score column ("invalid") is expected to be rejected with an
 * IllegalArgumentException.
 */
@Test(expected=IllegalArgumentException.class)
public void testScoreInvalid() {
    final String nonNumericScore = "chr1\t11873\t14409\tuc001aaa.3\tinvalid\t+\t11873\t11873\t0\t3\t354,109,1189,\t0,739,1347,";
    valueOf(nonNumericScore);
}

/**
 * An itemRgb column holding the text "invalid" is expected to be rejected
 * with an IllegalArgumentException.
 */
@Test(expected=IllegalArgumentException.class)
public void testItemRgbInvalid() {
    final String nonNumericItemRgb = "chr1\t11873\t14409\tuc001aaa.3\t0\t+\t11873\t11873\tinvalid\t3\t354,109,1189,\t0,739,1347,";
    valueOf(nonNumericItemRgb);
}

/**
 * A twelve-column record with a blockCount of 0 and empty blockSizes and
 * blockStarts columns is expected to be rejected with an
 * IllegalArgumentException.
 */
@Test(expected=IllegalArgumentException.class)
public void testBED12BlockCountZero() {
    final String noBlocks = "chr1\t11873\t14409\tuc001aaa.3\t0\t+\t11873\t11873\t0\t0\t\t";
    valueOf(noBlocks);
}

/**
 * The first entry in blockStarts is 1 rather than 0, which is expected to be
 * rejected with an IllegalArgumentException.
 */
@Test(expected=IllegalArgumentException.class)
public void testBED12FirstBlockStart() {
    final String firstBlockNotAtZero = "chr1\t11873\t14409\tuc001aaa.3\t0\t+\t11873\t11873\t0\t3\t353,109,1189,\t1,739,1347,";
    valueOf(firstBlockNotAtZero);
}

/**
 * The last block ends at offset 1347 + 1188 = 2535, one short of the feature
 * length 14409 - 11873 = 2536, which is expected to be rejected with an
 * IllegalArgumentException.
 */
@Test(expected=IllegalArgumentException.class)
public void testBED12LastBlockEnd() {
    final String lastBlockEndsEarly = "chr1\t11873\t14409\tuc001aaa.3\t0\t+\t11873\t11873\t0\t3\t354,109,1188,\t0,739,1347,";
    valueOf(lastBlockEndsEarly);
}

/**
 * The second block spans offsets 739 to 848 while the third block starts at
 * 847, so the two overlap; this is expected to be rejected with an
 * IllegalArgumentException.
 */
@Test(expected=IllegalArgumentException.class)
public void testBED12BlockOverlap() {
    final String overlappingBlocks = "chr1\t11873\t14409\tuc001aaa.3\t0\t+\t11873\t11873\t0\t3\t354,109,1689,\t0,739,847,";
    valueOf(overlappingBlocks);
}

/**
 * The blockStarts column lists 900 before 700, so the blocks are not in
 * ascending order; this is expected to be rejected with an
 * IllegalArgumentException.
 */
@Test(expected=IllegalArgumentException.class)
public void testBED12BlocksOutOfOrder() {
    final String unorderedBlocks = "chr1\t11873\t14409\tuc001aaa.3\t0\t+\t11873\t11873\t0\t4\t354,100,100,1189,\t0,900,700,1347,";
    valueOf(unorderedBlocks);
}

@Test
public void testEquals() {
BedRecord record1 = valueOf("chr1\t11873\t14409");
Expand Down Expand Up @@ -161,7 +216,7 @@ public void testValueOfBED5() {
assertEquals(11873L, record.getStart());
assertEquals(14409L, record.getEnd());
assertEquals("uc001aaa.3", record.getName());
assertEquals("0", record.getScore());
assertEquals(0, record.getScore());
assertEquals(BedFormat.BED5, record.getFormat());
assertEquals(Range.closedOpen(11873L, 14409L), record.toRange());
assertEquals("chr1\t11873\t14409\tuc001aaa.3\t0", record.toString());
Expand All @@ -174,7 +229,7 @@ public void testValueOfBED6() {
assertEquals(11873L, record.getStart());
assertEquals(14409L, record.getEnd());
assertEquals("uc001aaa.3", record.getName());
assertEquals("0", record.getScore());
assertEquals(0, record.getScore());
assertEquals("+", record.getStrand());
assertEquals(BedFormat.BED6, record.getFormat());
assertEquals(Range.closedOpen(11873L, 14409L), record.toRange());
Expand All @@ -188,7 +243,7 @@ public void testValueOfBED12() {
assertEquals(11873L, record.getStart());
assertEquals(14409L, record.getEnd());
assertEquals("uc001aaa.3", record.getName());
assertEquals("0", record.getScore());
assertEquals(0, record.getScore());
assertEquals("+", record.getStrand());
assertEquals(11873L, record.getThickStart());
assertEquals(11873L, record.getThickEnd());
Expand All @@ -206,4 +261,45 @@ public void testValueOfBED12() {
assertEquals(Range.closedOpen(11873L, 14409L), record.toRange());
assertEquals("chr1\t11873\t14409\tuc001aaa.3\t0\t+\t11873\t11873\t0\t3\t354,109,1189\t0,739,1347", record.toString());
}

/**
 * An empty name column is expected to be read back as ".".
 */
@Test
public void testValueOfBED12DefaultName() {
    final String emptyName = "chr1\t11873\t14409\t\t0\t+\t11873\t11873\t0\t3\t354,109,1189,\t0,739,1347,";
    assertEquals(".", valueOf(emptyName).getName());
}

/**
 * An empty strand column is expected to be read back as ".".
 */
@Test
public void testValueOfBED12DefaultStrand() {
    final String emptyStrand = "chr1\t11873\t14409\tuc001aaa.3\t0\t\t11873\t11873\t0\t3\t354,109,1189,\t0,739,1347,";
    assertEquals(".", valueOf(emptyStrand).getStrand());
}

/**
 * An empty score column is expected to be read back as 0.
 */
@Test
public void testValueOfBED12DefaultScore() {
    final String emptyScore = "chr1\t11873\t14409\tuc001aaa.3\t\t+\t11873\t11873\t0\t3\t354,109,1189,\t0,739,1347,";
    assertEquals(0, valueOf(emptyScore).getScore());
}

/**
 * An empty itemRgb column is expected to be read back as "0".
 */
@Test
public void testValueOfBED12DefaultItemRgb() {
    final String emptyItemRgb = "chr1\t11873\t14409\tuc001aaa.3\t0\t+\t11873\t11873\t\t3\t354,109,1189,\t0,739,1347,";
    assertEquals("0", valueOf(emptyItemRgb).getItemRgb());
}

/**
 * A three-component itemRgb value of "128,128,0" is expected to be accepted
 * and returned unchanged.
 */
@Test
public void testValueOfBED12ValidItemRgb() {
    final String rgbTriple = "chr19\t250275\t250322\tname7\t902\t-\t250276\t250321\t128,128,0\t2\t10,10\t0,37";
    assertEquals("128,128,0", valueOf(rgbTriple).getItemRgb());
}

/**
 * An itemRgb value written with leading zeros ("000,000,000") is expected to
 * be accepted and returned as "0,0,0".
 */
@Test
public void testValueOfBed12LeadingZerosItemRgb() {
    final String zeroPaddedRgb = "chr19\t250131\t250167\tname3\t914\t-\t250132\t250166\t000,000,000\t2\t10,10\t0,26";
    assertEquals("0,0,0", valueOf(zeroPaddedRgb).getItemRgb());
}

/**
 * An itemRgb value whose first component is 256 is expected to be rejected
 * with an IllegalArgumentException.
 */
@Test(expected=IllegalArgumentException.class)
public void testValueOfBed12InvalidItemRgb() {
    final String componentOutOfRange = "chr19\t250000\t250036\tname1\t889\t+\t250001\t250035\t256,128,0\t2\t10,10\t0,26";
    valueOf(componentOutOfRange);
}
}
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@
<plugins>
<plugin>
<artifactId>maven-javadoc-plugin</artifactId>
<version>3.5.0</version>
<version>3.6.3</version>
<configuration>
<detectLinks>true</detectLinks>
<show>package</show>
Expand Down
4 changes: 4 additions & 0 deletions tools/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,10 @@
<id>dsh-gfa1-to-gfa2</id>
<mainClass>org.dishevelled.bio.tools.Gfa1ToGfa2</mainClass>
</program>
<program>
<id>dsh-gff3-to-bed</id>
<mainClass>org.dishevelled.bio.tools.Gff3ToBed</mainClass>
</program>
<program>
<id>dsh-identify-gfa1</id>
<mainClass>org.dishevelled.bio.tools.IdentifyGfa1</mainClass>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ public ScoreFilter(final double score) {

@Override
public boolean accept(final BedRecord record) {
return record.getScore() != null && Double.valueOf(record.getScore()) > score;
return Double.valueOf(record.getScore()) > score;
}
}

Expand Down
179 changes: 179 additions & 0 deletions tools/src/main/java/org/dishevelled/bio/tools/Gff3ToBed.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
/*
dsh-bio-tools Command line tools.
Copyright (c) 2013-2024 held jointly by the individual authors.
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this library; if not, write to the Free Software Foundation,
Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
> http://www.fsf.org/licensing/licenses/lgpl.html
> http://www.opensource.org/licenses/lgpl-license.php
*/
package org.dishevelled.bio.tools;

import static org.dishevelled.compress.Readers.reader;
import static org.dishevelled.compress.Writers.writer;

import java.io.BufferedReader;
import java.io.File;
import java.io.PrintWriter;

import java.nio.file.Path;

import java.util.List;

import java.util.concurrent.Callable;

import org.dishevelled.bio.feature.gff3.Gff3Listener;
import org.dishevelled.bio.feature.gff3.Gff3Reader;
import org.dishevelled.bio.feature.gff3.Gff3Record;

import org.dishevelled.bio.feature.bed.BedRecord;
import org.dishevelled.bio.feature.bed.BedWriter;

import org.dishevelled.commandline.ArgumentList;
import org.dishevelled.commandline.CommandLine;
import org.dishevelled.commandline.CommandLineParseException;
import org.dishevelled.commandline.CommandLineParser;
import org.dishevelled.commandline.Switch;
import org.dishevelled.commandline.Usage;

import org.dishevelled.commandline.argument.FileArgument;
import org.dishevelled.commandline.argument.PathArgument;

/**
 * Convert transcript features in GFF3 format to BED format.
 * <p>
 * Only GFF3 records that carry a <code>transcript_id</code> attribute are
 * converted; every other record is skipped. Each converted record is written
 * as a single-block BED12 record.
 * </p>
 *
 * @since 2.4
 * @author Michael Heuer
 */
public final class Gff3ToBed implements Callable<Integer> {
    private final Path inputGff3Path;
    private final File outputBedFile;
    private static final String USAGE = "dsh-gff3-to-bed [args]";

    /** Lowest score written to BED output. */
    private static final long MIN_BED_SCORE = 0L;

    /** Highest score written to BED output. */
    private static final long MAX_BED_SCORE = 1000L;

    /**
     * Create a new GFF3 to BED converter.
     *
     * @param inputGff3Path input GFF3 path; the command line help describes a missing value as stdin
     * @param outputBedFile output BED file; the command line help describes a missing value as stdout
     */
    public Gff3ToBed(final Path inputGff3Path, final File outputBedFile) {
        this.inputGff3Path = inputGff3Path;
        this.outputBedFile = outputBedFile;
    }

    /**
     * Stream the input GFF3 records, converting each accepted record to BED
     * format and writing it to the output.
     *
     * @return <code>0</code> on success
     * @throws Exception if the input cannot be read or the output cannot be opened
     */
    @Override
    public Integer call() throws Exception {
        BufferedReader reader = null;
        PrintWriter writer = null;
        try {
            reader = reader(inputGff3Path);
            writer = writer(outputBedFile);

            // anonymous classes can only capture effectively final locals
            final PrintWriter w = writer;
            Gff3Reader.stream(reader, new Gff3Listener() {
                @Override
                public boolean record(final Gff3Record gff3Record) {
                    if (accept(gff3Record)) {
                        BedRecord bedRecord = convert(gff3Record);
                        BedWriter.write(bedRecord, w);
                    }
                    // always continue streaming, skipped records are not an error
                    return true;
                }
            });

            return 0;
        }
        finally {
            closeQuietly(reader);
            closeQuietly(writer);
        }
    }

    /**
     * Close the specified resource, if any, ignoring any failure to do so.
     *
     * @param resource resource to close, may be null if it was never opened
     */
    private static void closeQuietly(final AutoCloseable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        }
        catch (Exception ignored) {
            // best-effort close; a failure here must not mask the outcome of call()
        }
    }

    /**
     * Return true if the specified GFF3 record should be converted, that is,
     * if its attributes contain a <code>transcript_id</code> key.
     *
     * @param gff3Record GFF3 record, must not be null
     * @return true if the specified GFF3 record has a <code>transcript_id</code> attribute
     */
    static boolean accept(final Gff3Record gff3Record) {
        return gff3Record.getAttributes().containsKey("transcript_id");
    }

    /**
     * Convert the specified GFF3 record to a single-block BED12 record.
     * <p>
     * The first <code>transcript_id</code> value becomes the BED name, a missing
     * score becomes <code>0</code>, a missing strand becomes <code>"."</code>,
     * the thick region spans the whole feature, and itemRgb is <code>"0"</code>.
     * Callers are expected to check {@link #accept} first; this method reads
     * the first <code>transcript_id</code> value unconditionally.
     * </p>
     *
     * @param gff3Record GFF3 record to convert, must not be null
     * @return the specified GFF3 record converted to a BED12 record
     */
    static BedRecord convert(final Gff3Record gff3Record) {
        String chrom = gff3Record.getSeqid();
        // NOTE(review): start and end are copied unchanged, which assumes Gff3Record
        // already exposes them as a 0-based, half-open interval -- confirm
        long chromStart = gff3Record.getStart();
        long chromEnd = gff3Record.getEnd();

        // transcript_id is present, but might be multi-valued
        List<String> transcriptIds = gff3Record.getAttributes().get("transcript_id");
        String transcriptId = transcriptIds.get(0);

        // score may not be present
        int score = toBedScore(gff3Record.getScore());

        String strand = gff3Record.getStrand() == null ? "." : gff3Record.getStrand();
        long thickStart = chromStart;
        long thickEnd = chromEnd;
        String itemRgb = "0";

        // at least one block must be present
        int blockCount = 1;
        long[] blockSizes = new long[] { (chromEnd - chromStart) };
        long[] blockStarts = new long[] { 0L };

        return new BedRecord(chrom, chromStart, chromEnd, transcriptId, score, strand, thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts);
    }

    /**
     * Round the specified GFF3 score to the nearest integer and clamp it to the
     * range <code>[0, 1000]</code>.
     * <p>
     * The clamp is applied to the <code>long</code> rounding result before it is
     * narrowed to <code>int</code>; narrowing first would wrap scores larger than
     * {@link Integer#MAX_VALUE} (or positive infinity) around to a value that
     * then clamps to <code>0</code> instead of <code>1000</code>.
     * </p>
     *
     * @param gff3Score GFF3 score, may be null
     * @return <code>0</code> if the score is null, otherwise the rounded score clamped to <code>[0, 1000]</code>
     */
    private static int toBedScore(final Number gff3Score) {
        if (gff3Score == null) {
            return 0;
        }
        long rounded = Math.round(gff3Score.doubleValue());
        return (int) Math.max(MIN_BED_SCORE, Math.min(MAX_BED_SCORE, rounded));
    }

    /**
     * Main.
     *
     * @param args command line args
     */
    public static void main(final String[] args) {
        Switch about = new Switch("a", "about", "display about message");
        Switch help = new Switch("h", "help", "display help message");
        PathArgument inputGff3Path = new PathArgument("i", "input-gff3-path", "input GFF3 path, default stdin", false);
        FileArgument outputBedFile = new FileArgument("o", "output-bed-file", "output BED file, default stdout", false);

        ArgumentList arguments = new ArgumentList(about, help, inputGff3Path, outputBedFile);
        CommandLine commandLine = new CommandLine(args);

        Gff3ToBed gff3ToBed = null;
        try
        {
            CommandLineParser.parse(commandLine, arguments);
            if (about.wasFound()) {
                About.about(System.out);
                System.exit(0);
            }
            if (help.wasFound()) {
                Usage.usage(USAGE, null, commandLine, arguments, System.out);
                System.exit(0);
            }
            gff3ToBed = new Gff3ToBed(inputGff3Path.getValue(), outputBedFile.getValue());
        }
        catch (CommandLineParseException e) {
            // invalid arguments: show usage together with the parse error and exit
            Usage.usage(USAGE, e, commandLine, arguments, System.err);
            System.exit(-1);
        }
        try {
            System.exit(gff3ToBed.call());
        }
        catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}
1 change: 1 addition & 0 deletions tools/src/main/java/org/dishevelled/bio/tools/Tools.java
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ static String[] dropFirst(final String[] args) {
.put("filter-sam", new Command("filter-sam", "filter alignments in SAM format", FilterSam.class))
.put("filter-vcf", new Command("filter-vcf", "filter variants in VCF format", FilterVcf.class))
.put("gfa1-to-gfa2", new Command("gfa1-to-gfa2", "convert GFA 1.0 format to GFA 2.0 format", Gfa1ToGfa2.class))
.put("gff3-to-bed", new Command("gff3-to-bed", "convert transcript features in GFF3 format to BED format", Gff3ToBed.class))
.put("identify-gfa1", new Command("identify-gfa1", "add identifier annotation to records in GFA 1.0 format", IdentifyGfa1.class))
.put("interleave-fastq", new Command("interleave-fastq", "convert first and second DNA sequence files in FASTQ format to interleaved FASTQ format", InterleaveFastq.class))
.put("interleaved-fastq-to-bam", new Command("interleaved-fastq-to-bam", "convert DNA sequences in interleaved FASTQ format to unaligned BAM format", InterleavedFastqToBam.class))
Expand Down

0 comments on commit 55f4fd0

Please sign in to comment.