Skip to content

Commit 074e95e

Browse files
committed
Add find_in_reference tool
Updated for python3.
1 parent 7fcdca7 commit 074e95e

8 files changed

+367
-0
lines changed

find_in_reference/.shed.yml

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
categories:
2+
- Text Manipulation
3+
- Sequence Analysis
4+
description: Find and filter sequences by subsequences
5+
long_description: |
6+
7+
Filters lines of a tabular input file by checking if the selected input column
8+
is a substring of the selected column of any line in the reference file.
9+
This can be used to check if peptides sequences are present in a set of reference proteins,
10+
as a means of filtering out uninteresting peptide sequences.
11+
12+
name: find_in_reference
13+
owner: jjohnson
14+
remote_repository_url: https://github.com/jj-umn/galaxytools/tree/master/find_in_reference
15+
type: unrestricted
+177
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#!/usr/bin/env python3
2+
3+
4+
import os.path
5+
import sys
6+
import optparse
7+
8+
9+
"""
10+
#
11+
#------------------------------------------------------------------------------
12+
# University of Minnesota
13+
# Copyright 2013, Regents of the University of Minnesota
14+
#------------------------------------------------------------------------------
15+
# Author:
16+
#
17+
# James E Johnson
18+
#
19+
#------------------------------------------------------------------------------
20+
"""
21+
22+
"""
23+
Takes 2 tabular files as input:
24+
1. The file to be filtered
25+
2. The reference file
26+
27+
The string value of selected column of the input file is searched for
28+
in the string values of the selected column of the reference file.
29+
30+
The intended purpose is to filter a peptide fasta file in tabular format
31+
by whether those peptide sequences are found in a reference fasta file.
32+
33+
"""
34+
35+
36+
def __main__():
37+
# Parse Command Line
38+
parser = optparse.OptionParser()
39+
parser.add_option('-i', '--input', dest='input', help='The input file to filter. (Otherwise read from stdin)')
40+
parser.add_option('-r', '--reference', dest='reference', help='The reference file to filter against')
41+
parser.add_option('-o', '--output', dest='output', help='The output file for input lines filtered by reference')
42+
parser.add_option('-f', '--filtered', dest='filtered', help='The output file for input lines not in the output')
43+
parser.add_option('-c', '--input_column', dest='input_column', type="int", default=None, help='The column for the value in the input file. (first column = 1, default to last column)')
44+
parser.add_option('-C', '--reference_column', dest='reference_column', type="int", default=None, help='The column for the value in the reference file. (first column = 1, default to last column)')
45+
parser.add_option('-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive')
46+
parser.add_option('-R', '--reverse_find', dest='reverse_find', action="store_true", default=False, help='find the reference string in the input string')
47+
parser.add_option('-B', '--test_reverse', dest='test_reverse', action="store_true", default=False, help='Also search for reversed input string in reference')
48+
parser.add_option('-D', '--test_dna_reverse_complement', dest='test_reverse_comp', action="store_true", default=False, help='Also search for the DNA reverse complement of input string')
49+
parser.add_option('-k', '--keep', dest='keep', action="store_true", default=False, help='')
50+
parser.add_option('-a', '--annotation_columns', dest='annotation_columns', default=None, help='If string is found, add these columns from reference')
51+
parser.add_option('-s', '--annotation_separator', dest='annotation_separator', default=';', help='separator character between annotations from different lines')
52+
parser.add_option('-S', '--annotation_col_sep', dest='annotation_col_sep', default=', ', help='separator character between annotation column from the same line')
53+
parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout')
54+
(options, args) = parser.parse_args()
55+
56+
# revcompl = lambda x: ''.join([{'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'N': 'N', 'n': 'n'}[B] for B in x][: : -1])
57+
58+
COMP = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'N': 'N', 'n': 'n'}
59+
60+
def revcompl(seq):
61+
return ''.join([COMP[B] for B in seq][::-1])
62+
63+
def test_rcomplement(seq, target):
64+
if options.test_reverse_comp:
65+
try:
66+
comp = revcompl(seq)
67+
return comp in target
68+
except Exception:
69+
pass
70+
return False
71+
72+
def test_reverse(seq, target):
73+
return options.test_reverse and seq and seq[::-1] in target
74+
75+
# Input files
76+
if options.input is not None:
77+
try:
78+
inputPath = os.path.abspath(options.input)
79+
inputFile = open(inputPath, 'r')
80+
except Exception as e:
81+
print("failed: %s" % e, file=sys.stderr)
82+
exit(2)
83+
else:
84+
inputFile = sys.stdin
85+
# Reference
86+
if options.reference is None:
87+
print("failed: reference file is required", file=sys.stderr)
88+
exit(2)
89+
# Output files
90+
outFile = None
91+
filteredFile = None
92+
if options.filtered is None and options.output is None:
93+
# write to stdout
94+
outFile = sys.stdout
95+
else:
96+
if options.output is not None:
97+
try:
98+
outPath = os.path.abspath(options.output)
99+
outFile = open(outPath, 'w')
100+
except Exception as e:
101+
print("failed: %s" % e, file=sys.stderr)
102+
exit(3)
103+
if options.filtered is not None:
104+
try:
105+
filteredPath = os.path.abspath(options.filtered)
106+
filteredFile = open(filteredPath, 'w')
107+
except Exception as e:
108+
print("failed: %s" % e, file=sys.stderr)
109+
exit(3)
110+
incol = -1
111+
if options.input_column and options.input_column > 0:
112+
incol = int(options.input_column)-1
113+
refcol = -1
114+
if options.reference_column and options.reference_column > 0:
115+
refcol = int(options.reference_column)-1
116+
if options.annotation_columns:
117+
annotate = True
118+
annotation_columns = [int(x) - 1 for x in options.annotation_columns.split(', ')]
119+
else:
120+
annotate = False
121+
refFile = None
122+
num_found = 0
123+
num_novel = 0
124+
for ln, line in enumerate(inputFile):
125+
annotations = []
126+
try:
127+
found = False
128+
search_string = line.split('\t')[incol].rstrip('\r\n')
129+
if options.ignore_case:
130+
search_string = search_string.upper()
131+
if options.debug:
132+
print("search: %s" % (search_string), file=sys.stderr)
133+
refFile = open(options.reference, 'r')
134+
for tn, fline in enumerate(refFile):
135+
fields = fline.split('\t')
136+
target_string = fields[refcol].rstrip('\r\n')
137+
if options.ignore_case:
138+
target_string = target_string.upper()
139+
search = search_string if not options.reverse_find else target_string
140+
target = target_string if not options.reverse_find else search_string
141+
if options.debug:
142+
print("in: %s %s %s" % (search, search in target, target), file=sys.stderr)
143+
if search in target or test_reverse(search, target) or test_rcomplement(search, target):
144+
found = True
145+
if annotate:
146+
annotation = options.annotation_col_sep.join([fields[i] for i in annotation_columns])
147+
annotations.append(annotation)
148+
else:
149+
break
150+
if found:
151+
num_found += 1
152+
if annotate:
153+
line = '%s\t%s\n' % (line.rstrip('\r\n'), options.annotation_separator.join(annotations))
154+
if options.keep is True:
155+
if outFile:
156+
outFile.write(line)
157+
else:
158+
if filteredFile:
159+
filteredFile.write(line)
160+
else:
161+
num_novel += 1
162+
if options.keep is True:
163+
if filteredFile:
164+
filteredFile.write(line)
165+
else:
166+
if outFile:
167+
outFile.write(line)
168+
except Exception as e:
169+
print("failed: Error reading %s - %s" % (options.reference, e), file=sys.stderr)
170+
finally:
171+
if refFile:
172+
refFile.close()
173+
print("found: %d novel: %d" % (num_found, num_novel), file=sys.stdout)
174+
175+
176+
if __name__ == "__main__":
177+
__main__()
+152
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
<?xml version="1.0"?>
2+
<tool id="find_in_reference" name="find in reference" version="0.1.0">
3+
<description>filter peptides that are present in proteins</description>
4+
<command interpreter="python">find_in_reference.py --input "$input"
5+
--reference "$reference"
6+
#if $column.set == 'yes':
7+
--input_column $column.input_column
8+
--reference_column $column.reference_column
9+
#end if
10+
$case_insensitive $reverse_find $test_reverse $test_dna_reverse_complement
11+
#if 'novel' in $outputs.__str__ or not 'found' in $outputs.__str__:
12+
--output "$novel"
13+
#end if
14+
#if 'found' in $outputs.__str__:
15+
--filtered "$found"
16+
#if $annotate.from_ref == 'yes' and str($annotate.annotation_columns) != 'None':
17+
--annotation_columns $annotate.annotation_columns
18+
#if $annotate.annotation_separator != '':
19+
--annotation_separator '$annotate.annotation_separator'
20+
#end if
21+
#if $annotate.annotation_col_sep != '':
22+
--annotation_col_sep '$annotate.annotation_col_sep'
23+
#end if
24+
#end if
25+
#end if
26+
</command>
27+
<inputs>
28+
<param name="input" type="data" format="tabular" label="Input file to be filtered"
29+
help="e.g. a peptide fasta converted to tabular"/>
30+
<param name="reference" type="data" format="tabular" label="reference file to search"
31+
help="e.g. a protein fasta converted to tabular"/>
32+
<conditional name="column">
33+
<param name="set" type="select" label="select columns to compare">
34+
<option value="no" selected="true">Use last column of input and reference</option>
35+
<option value="yes">Choose the column of input and reference to compare</option>
36+
</param>
37+
<when value="no"/>
38+
<when value="yes">
39+
<param name="input_column" type="data_column" data_ref="input" label="column in input (defaults to last column)"
40+
help=""/>
41+
<param name="reference_column" type="data_column" data_ref="reference" label="column in reference (defaults to last column)"
42+
help=""/>
43+
</when>
44+
</conditional>
45+
<param name="case_insensitive" type="boolean" truevalue="--case_insensitive" falsevalue="" checked="false" label="Ignore case when comparing"/>
46+
<param name="reverse_find" type="boolean" truevalue="--reverse_find" falsevalue="" checked="false" label="reverse search: find the reference in the input" />
47+
<param name="test_reverse" type="boolean" truevalue="--test_reverse" falsevalue="" checked="false" label="Also search for reversed input string in the reference" />
48+
<param name="test_dna_reverse_complement" type="boolean" truevalue="--test_dna_reverse_complement" falsevalue="" checked="false" label="Also search for the DNA reverse complementof of the input in the reference" />
49+
<param name="outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs">
50+
<option value="novel" selected="true">lines with no match in reference</option>
51+
<option value="found">lines with match in reference</option>
52+
</param>
53+
<conditional name="annotate">
54+
<param name="from_ref" type="select" label="Annotate found input entries with columns from reference">
55+
<option value="no" selected="true">No</option>
56+
<option value="yes">Yes</option>
57+
</param>
58+
<when value="no"/>
59+
<when value="yes">
60+
<param name="annotation_columns" type="data_column" data_ref="reference" multiple="true" label="columns from reference to append to found input lines"
61+
help=""/>
62+
<param name="annotation_separator" type="text" value=";" optional="true" label="separator to place between annotations from different reference lines"
63+
help="defaults to ;">
64+
<validator type="regex" message="Single quote character is not allowed">^[^']*$</validator>
65+
<sanitizer>
66+
<valid initial="string.printable">
67+
<remove value="&apos;"/>
68+
</valid>
69+
<mapping initial="none">
70+
<add source="&apos;" target=""/>
71+
</mapping>
72+
</sanitizer>
73+
</param>
74+
<param name="annotation_col_sep" type="text" value="," optional="true" label="separator to place between annotation columns from the same reference line"
75+
help="defaults to ,">
76+
<validator type="regex" message="Single quote character is not allowed">^[^']*$</validator>
77+
<sanitizer>
78+
<valid initial="string.printable">
79+
<remove value="&apos;"/>
80+
</valid>
81+
<mapping initial="none">
82+
<add source="&apos;" target=""/>
83+
</mapping>
84+
</sanitizer>
85+
</param>
86+
</when>
87+
</conditional>
88+
</inputs>
89+
<stdio>
90+
<exit_code range="1:" level="fatal" description="Error" />
91+
</stdio>
92+
<outputs>
93+
<data name="found" metadata_source="input" format_source="input" label="${tool.name} on ${on_string}: found">
94+
<filter>'found' in str(outputs)</filter>
95+
</data>
96+
<data name="novel" metadata_source="input" format_source="input" label="${tool.name} on ${on_string}: novel">
97+
<filter>'novel' in str(outputs) or not 'found' in str(outputs)</filter>
98+
</data>
99+
</outputs>
100+
<tests>
101+
<test>
102+
<param name="input" value="human_peptides.tabular" ftype="tabular" dbkey="hg19"/>
103+
<param name="reference" value="human_proteins.tabular" ftype="tabular" dbkey="hg19"/>
104+
<output name="novel" file="novel_peptides.tabular"/>
105+
</test>
106+
<test>
107+
<param name="input" value="human_proteins.tabular" ftype="tabular" dbkey="hg19"/>
108+
<param name="reference" value="human_peptides.tabular" ftype="tabular" dbkey="hg19"/>
109+
<conditional name="column">
110+
<param name="set" value="yes"/>
111+
<param name="input_column" value="2"/>
112+
<param name="reference_column" value="2"/>
113+
</conditional>
114+
<param name="reverse_find" value="True"/>
115+
<param name="outputs" value="found"/>
116+
<output name="found" file="found_proteins.tabular"/>
117+
</test>
118+
</tests>
119+
<help>
120+
**Find in Reference**
121+
122+
Filters lines of a tabular input file by checking if the selected input column value
123+
is a substring of the selected column of any line in the reference file.
124+
125+
This can be used to check if peptides sequences are present in a set of reference proteins,
126+
as a means of filtering out uninteresting peptide sequences.
127+
128+
For Example with::
129+
130+
Input
131+
>pep1 LIL
132+
>pep2 WTF
133+
>pep3 ISK
134+
135+
Reference
136+
>prot1 RLET
137+
>prot2 LLIL
138+
>prot3 LAPSE
139+
>prot3 RISKY
140+
141+
The outputs
142+
143+
Not found in reference
144+
>pep2 WTF
145+
146+
Found in reference
147+
>pep1 LIL
148+
>pep3 ISK
149+
150+
151+
</help>
152+
</tool>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pep_1|sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 MHPAVFLSLPDLRCSLLLL
2+
pep_2|sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 NENQVVFARVDCDQHSDIAQRYRISKY
3+
pep_3|sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 FQKLAPSEYRYTLLRDRDEL
4+
pep_5|sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4 INGQFVERCWT
5+
pep_7|sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 MNGTEGPNFYVPFSNA
6+
pep_8|sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRFSQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMKREYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFERVANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDKCVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHADCDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREFHHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL
2+
sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4 MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHLQILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYALVIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNEECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECLGNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQGCHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGCTVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETLEIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQERNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQNVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFSDERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWERQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQILKELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAFPNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYVSARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCVSRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIGPLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSREKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKGFTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMAAEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPVRWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDNCPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEMEFEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSNPS
3+
sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
pep_1|sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 MHPAVFLSLPDLRCSLLLL
2+
pep_2|sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 NENQVVFARVDCDQHSDIAQRYRISKY
3+
pep_3|sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 FQKLAPSEYRYTLLRDRDEL
4+
pep_4|sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 LLILVTWVFTPVTTEITSLDTE
5+
pep_5|sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4 INGQFVERCWT
6+
pep_6|sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4 YALVSLSFFRKLRLIRLET
7+
pep_7|sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 MNGTEGPNFYVPFSNA
8+
pep_8|sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA

0 commit comments

Comments
 (0)