Skip to content

Commit 2757c4e

Browse files
author
Felix Van der Jeugt
committed
fix detection of start/stop codons for 'complete' input
* fix/complete-start: run prodigal without meta option allow empty sequences include start and stop codons in output add last nights benchmarks to the evaluation README add quality measuring scripts compare to negative strand for dna_start_t_withstop invert s_save to avoid negative numbers negate strcmp dropme: start with start states infinity is sometimes smaller in FGS unfix start1/stop1 filenames include reverse stop codon in meta output fix ACGT typo to correct complete predictions revert "extend genes in complete genomes to start/stop codons" use corrected dna_start_t extend genes in complete genomes to start/stop codons
2 parents 7bddb73 + 437ef43 commit 2757c4e

File tree

10 files changed

+228
-29
lines changed

10 files changed

+228
-29
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,8 @@
11
/target
2+
FragGeneScan
3+
FGS+
4+
prodigal
5+
*.ffn
6+
*.faa
7+
*.gff
8+
*.csv

meta/evaluation/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
*.fasta
2+
*.txt

meta/evaluation/README.md

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Evaluation of FGS, FGSrs, FGS+, Prodigal on whole genomes
2+
3+
Source assembly: https://www.ebi.ac.uk/ena/browser/view/GCA_001628815?show=chromosomes
4+
5+
The 'FASTA' download `ena_data_20210917-1328.fasta` is the complete assembly.
6+
7+
The 'TEXT' download `ena_data_20210917-1328.txt` also contains annotated genes.
8+
9+
## Create the annotations and lengths files
10+
11+
Execute `annotations.py`.
12+
13+
## Create the FGS/FGS+ files (from .aa)
14+
15+
(swap directories to execute these)
16+
17+
```sh
18+
cd path/to/FGS
19+
./FragGeneScan -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS -t complete -w 1
20+
./FGS+ -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS+ -t complete -w 1
21+
cd -
22+
rm FGS.out FGS.ffn
23+
sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS.faa > FGS.csv
24+
sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS+.faa > FGS+.csv
25+
```
26+
27+
## Create the FGSrs/Prodigal files (from .gff)
28+
29+
```sh
30+
FragGeneScanRs -s ena_data_20210917-1328.fasta -g FGSrs.gff -t complete -w 1
31+
prodigal -i ena_data_20210917-1328.fasta -f gff -o prodigal.gff
32+
grep -v '^#' FGSrs.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > FGSrs.csv
33+
grep -v '^#' prodigal.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > prodigal.csv
34+
```
35+
36+
## Print comparison table
37+
38+
Execute `rates.py`.
39+
40+
## Timings for these predictions using [hyperfine](https://github.com/sharkdp/hyperfine)
41+
42+
Run in the FGS or FGS+ directory (for the training files).
43+
44+
```sh
45+
hyperfine 'FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1' \
46+
'FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1' \
47+
'FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1' \
48+
'prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff'
49+
```
50+
51+
```
52+
Benchmark #1: ./FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1
53+
Time (mean ± σ): 3.797 s ± 0.006 s [User: 3.413 s, System: 0.348 s]
54+
Range (min … max): 3.792 s … 3.807 s 5 runs
55+
56+
Benchmark #2: ./FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1
57+
Time (mean ± σ): 369.979 s ± 25.774 s [User: 367.679 s, System: 0.517 s]
58+
Range (min … max): 353.713 s … 415.649 s 5 runs
59+
60+
Benchmark #1: FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1
61+
Time (mean ± σ): 1.703 s ± 0.014 s [User: 1.395 s, System: 0.275 s]
62+
Range (min … max): 1.684 s … 1.719 s 5 runs
63+
64+
Benchmark #4: prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff
65+
Time (mean ± σ): 8.533 s ± 0.038 s [User: 8.453 s, System: 0.047 s]
66+
Range (min … max): 8.493 s … 8.573 s 5 runs
67+
```

meta/evaluation/annotations.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
with open('ena_data_20210917-1328.txt') as f, open('annotations.csv', 'w') as a, open('readlengths.csv', 'w') as l:
2+
for line in f:
3+
if line.startswith('AC'):
4+
accession = line[2:].strip()[:-1]
5+
elif line.startswith('FT gene'):
6+
span = line[9:].strip()
7+
if span.startswith('complement('):
8+
span = span[11:-1]
9+
strand = '-'
10+
else:
11+
strand = '+'
12+
start, end = span.split('..')
13+
print(accession, start, end, strand, sep=',', file=a)
14+
elif line.startswith('SQ'):
15+
parts = line[2:].strip().split(' ')
16+
assert parts[0] == "Sequence"
17+
print(accession, parts[1], sep=',', file=l)
18+
assert parts[2].startswith("BP")

meta/evaluation/rates.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env python
2+
from collections import namedtuple
3+
from operator import itemgetter
4+
5+
Annotation = namedtuple('Annotation', ['name', 'start', 'end', 'strand'])
6+
Event = namedtuple('Event', ['location', 'annotation', 'prediction'])
7+
8+
def parse_readlengths(filename):
9+
with open(filename) as f:
10+
for line in f:
11+
name, length = line.strip().split(',')
12+
yield name, int(length)
13+
14+
def parse_csv(filename):
15+
with open(filename) as f:
16+
for line in f:
17+
name, start, end, strand = line.strip().split(',')
18+
yield Annotation(name, int(start), int(end), 1 if strand == '+' else -1)
19+
20+
def events(read_lengths, annotations, predictions):
21+
names = dict()
22+
total = 0
23+
for name, length in parse_readlengths(read_lengths):
24+
names[name] = total
25+
total += length
26+
27+
for name, start, end, strand in parse_csv(annotations):
28+
yield Event(names[name] + start, strand, None)
29+
yield Event(names[name] + end, 0, None)
30+
31+
for name, start, end, strand in parse_csv(predictions):
32+
yield Event(names[name] + start, None, strand)
33+
yield Event(names[name] + end, None, 0)
34+
35+
def rates(read_lengths, annotations, predictions):
36+
rates = dict(tp=0, fp=0, tn=0, fn=0)
37+
cl = 0 # current location
38+
ca = 0 # current annotation
39+
cp = 0 # current prediction
40+
for l, a, p in sorted(events(read_lengths, annotations, predictions), key=itemgetter(0)):
41+
if ca == 0 and cp == 0:
42+
rates['tn'] += l - cl
43+
elif ca == cp:
44+
rates['tp'] += l - cl
45+
elif ca == 0 and cp != 0:
46+
rates['fp'] += l - cl
47+
elif ca != 0 and cp == 0:
48+
rates['fn'] += l - cl
49+
else: # different strands
50+
rates['fp'] += l - cl
51+
52+
if a is not None: ca = a
53+
if p is not None: cp = p
54+
cl = l
55+
return rates
56+
57+
body = '{:<10}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}'
58+
head = '{:<10}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}'
59+
print(head.format('tool', 'TP', 'FP', 'TN', 'FN', 'precision', 'sensitivity', 'specificity', 'NPV', 'MCC'))
60+
for tool in ['FGS', 'FGS+', 'prodigal', 'FGSrs']:
61+
r = rates('readlengths.csv', 'annotations.csv', f'{tool}.csv')
62+
tp, fp, tn, fn = r['tp'], r['fp'], r['tn'], r['fn']
63+
t = tp + fp + tn + fn
64+
print(body.format(
65+
tool,
66+
tp / t,
67+
fp / t,
68+
tn / t,
69+
fn / t,
70+
tp / (tp + fp),
71+
tp / (tp + fn),
72+
tn / (tn + fp),
73+
tn / (tn + fn),
74+
(tp * tn - fp * fn) / ((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))**0.5
75+
))

meta/out-to-gff.awk

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
BEGIN { print "##gff-version 3"; }
2+
{
3+
s = substr($1, 1, 1)
4+
if (s == ">") {
5+
seqid = substr($1, 2)
6+
} else {
7+
s = split($0, t, "\t")
8+
id = "ID=" seqid "_" t[1] "_" t[2] "_" t[3] ";product=predicted protein"
9+
print seqid "\tFGS\tCDS\t" t[1] "\t" t[2] "\t.\t" t[3] "\t" int(t[4] - 1) "\t" id
10+
}
11+
}

src/bin/FragGeneScanRs.rs

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use rayon::ThreadPoolBuilder;
2323

2424
extern crate frag_gene_scan_rs;
2525
use frag_gene_scan_rs::dna::{count_cg_content, Nuc};
26+
use frag_gene_scan_rs::gene;
2627
use frag_gene_scan_rs::hmm;
2728
use frag_gene_scan_rs::viterbi::viterbi;
2829

@@ -236,13 +237,17 @@ fn run<R: Read + Send, W: WritingBuffer + Send>(
236237
let fasta::OwnedRecord { mut head, seq } = record?;
237238
head = head.into_iter().take_while(u8::is_ascii_graphic).collect();
238239
let nseq: Vec<Nuc> = seq.into_iter().map(Nuc::from).collect();
239-
let read_prediction = viterbi(
240-
&global,
241-
&locals[count_cg_content(&nseq)],
242-
head,
243-
nseq,
244-
whole_genome,
245-
);
240+
let read_prediction = if nseq.is_empty() {
241+
gene::ReadPrediction::new(head)
242+
} else {
243+
viterbi(
244+
&global,
245+
&locals[count_cg_content(&nseq)],
246+
head,
247+
nseq,
248+
whole_genome,
249+
)
250+
};
246251
if meta_buffer.is_some() {
247252
read_prediction.meta(&mut metabuf)?;
248253
}

src/gene.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,6 @@ impl ReadPrediction {
7474

7575
pub struct Gene {
7676
pub start: usize,
77-
pub metastart: usize,
7877
pub end: usize,
7978
pub frame: usize,
8079
pub score: f64,
@@ -89,7 +88,7 @@ impl Gene {
8988
buf.append(
9089
&mut format!(
9190
"{}\t{}\t{}\t{}\t{:.6}\tI:{}\tD:{}\n",
92-
self.metastart,
91+
self.start,
9392
self.end,
9493
if self.forward_strand { '+' } else { '-' },
9594
self.frame,
@@ -112,12 +111,12 @@ impl Gene {
112111
&mut format!(
113112
"{}\tFGS\tCDS\t{}\t{}\t.\t{}\t{}\tID={}_{}_{}_{};product=predicted protein\n",
114113
head,
115-
self.metastart,
114+
self.start,
116115
self.end,
117116
if self.forward_strand { '+' } else { '-' },
118117
self.frame - 1,
119118
head,
120-
self.metastart,
119+
self.start,
121120
self.end,
122121
if self.forward_strand { '+' } else { '-' }
123122
)

src/hmm.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,8 @@ pub fn get_train_from_file(
149149
read_noncoding(&mut locals, train_dir.join("noncoding"))?;
150150
read_start(&mut locals, train_dir.join("start"))?;
151151
read_stop(&mut locals, train_dir.join("stop"))?;
152-
read_start1(&mut locals, train_dir.join("start1"))?;
153-
read_stop1(&mut locals, train_dir.join("stop1"))?;
152+
read_start1(&mut locals, train_dir.join("stop1"))?; // keep FGS naming scheme
153+
read_stop1(&mut locals, train_dir.join("start1"))?; // keep FGS naming scheme
154154
read_pwm(&mut locals, train_dir.join("pwm"))?;
155155

156156
Ok((global, locals))

0 commit comments

Comments
 (0)