-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyse_mutations.py
executable file
·277 lines (217 loc) · 12.6 KB
/
analyse_mutations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#!/usr/bin/env python3
import os
import argparse
import sys
import os.path
import re
#Change this path to import the different program used from an other directory
sys.path.insert(1, '/lib/')
import countFrequenceFromSRS
import tag_handling
import transform_tag
import creation_logo
import countFrequenceFromSRS_with2PAM
import compare_srs_file
parser = argparse.ArgumentParser()
parser.add_argument('-s','--score', help = 'choose a minimal score of alignment between 0 and 350',default=200,choices=range(0,351),metavar="[0-350]" ,type = int)
parser.add_argument('-f','--forward', help = 'path to the forward sequence file (required)', type = str,required=True)
parser.add_argument('-r','--reverse', help = 'path to the reverse sequence file (required)',type = str, required=True)
parser.add_argument('-t','--target',help= 'path to the target sequence (required)',type =str, required=True,nargs='+')
parser.add_argument('-p','--primer',help ='path to the primers sequence in fasta format (required) as to be the same number of file as the sequence target',type = str , required =True,nargs='+')
parser.add_argument('-pam',help ='position of the first base of the pam sequence as to be calculate from the sequence without the primers (required)',type = int,required = True,nargs='+')
parser.add_argument('-a','--adapters',help ='path to the adapters (required)',type = str,required = True)
parser.add_argument('-o','--directory', help ='name of results directory',type =str, default = 'output_program')
parser.add_argument('-cutoff',help = 'Cutoff of frequency for the logo', type = float, default =0.0001)
parser.add_argument('-d','--direction',help='Give the direction of the pam sequence (use in order to calculate the CRISPR range', choices=['sens','antisens','forward','reverse'],required = True,nargs='+',type = str)
args = parser.parse_args()
#raise an exception if pam position as more than two number
if len(args.pam) > 2 :
print("Pam argument take one or two position but not more")
sys.exit(1)
score = args.score
read1 = args.forward
read2 = args.reverse
sequence_target = args.target
primers = args.primer
pam_position = args.pam
adapters = args.adapters
directory = args.directory
cutoff = args.cutoff
direction = args.direction
#raise an error if the number of primer and sequence target are not equal
if len(primers) != len(sequence_target):
print("Number of primers and sequence target file as to be the same")
sys.exit(1)
# raise an error if the cutoff is superior to 1
if cutoff > 1 :
print("Cutoff cannot be superior to 1")
sys.exit(1)
# raise an error if the number of pam position and the number of direction are not equal
if len(pam_position) != len(direction):
print("Number of pam position and direction have to be the same")
sys.exit(1)
#get working directory
path=os.path.abspath(sys.argv[0])
scriptDir=os.path.dirname(path)
#Path to the different program used
PathToPEAR='/usr/local/bin/pear'
PathToFASTQ_MCF='/usr/bin/fastq-mcf'
PathToFASTQC='/usr/bin/fastqc'
PathToBowtieBuild='/usr/bin/bowtie2-build'
PathToBowtie2='/usr/bin/bowtie2'
PathToMatrix='/home/jjust01/src/napoly/repeats/bin/EDNAFULL83'
#function to count the number of reads containing a N in the sequence
def count_number_sequence_withN(fileIN):
regex=re.compile(r"^[ATCGN]*N[ATCGN]*$")
count=0
for i in open(fileIN,'r'):
if regex.match(i):
count+=1
return(count)
#function to count the number of reads in a fasta format
def count_number_sequence(fileIN):
regex=re.compile(r"^[ATCGN]*$")
count=0
for i in open(fileIN,'r'):
if regex.match(i):
count+=1
return(count)
#function to count the number of line in a file
def count_nb_lines(fileIN):
count = -1
for i in open(fileIN,'r'):
count += 1
return(count)
#create log directory
path_log=scriptDir+'/'+directory
if os.path.exists(path_log) :
if len(os.listdir(path_log)) != 0 :
print('\n#############################\nWARNING : DIRECTORY NOT EMPTY\n#############################\n')
else :
os.makedirs(path_log,exist_ok=True)
#get filename W/O extension
filename=os.path.basename(read1)
filename_wo_extension=filename.split('.')[0]
print('\n######################\nSTARTING PEAR\n######################\n')
os.makedirs(path_log+"/PEAR", exist_ok =True)
Pear_output_file=path_log+'/PEAR/'+filename_wo_extension
#assembled paired end sequences
os.system(PathToPEAR+" -f "+read1+" -r "+read2+" -o "+Pear_output_file+" -y 2G -p 0.001 -j10")
#count the percent of reads unassembled
count_of_lines_Pear_output = sum(1 for l in open(Pear_output_file+'.assembled.fastq'))
extension=os.path.splitext(read1)
if extension[1]=='.gz':
os.system("gunzip "+read1)
read1=extension[0]
count_of_lines_Input_file=sum(1 for l in open(read1))
os.system("gzip "+read1)
read1=sys.argv[1]
else:
count_of_lines_Input_file=sum(1 for l in open(read1))
file_count_percent_of_reads_kept=path_log+"/stderr.out"
if int(count_of_lines_Pear_output)/int(count_of_lines_Input_file)<0.87:
print("Percent of assembly read is inferior to the cutoff (0.87), check the manual of PEAR to see if a solution exist")
sys.exit(1)
else :
with open(file_count_percent_of_reads_kept,'w')as read_kept:
read_kept.write("########################\nAssembly step using PEAR\n########################\n")
read_kept.write("Number of lines from input file : "+str(count_of_lines_Input_file/4)+'\n')
read_kept.write("Number of reads from PEAR output file : "+str(count_of_lines_Pear_output/4)+'\n')
read_kept.write("Percent reads assembly: "+str(count_of_lines_Pear_output/count_of_lines_Input_file)+"\n")
print('\n######################\nSTARTING FASTQC AND CUT ADAPTERS\n######################\n')
os.makedirs(path_log+"/QC", exist_ok=True)
Input_file_QC=path_log+"/PEAR/"+filename_wo_extension+".assembled.fastq"
Output_file_html=filename_wo_extension+".assembled.trimmed.fastq.html"
Output_file_trimmed=filename_wo_extension+".assembled.trimmed.fastq"
print("#### Trimming phase ####")
os.system(PathToFASTQ_MCF+" -l 100 -o "+path_log+"/QC/"+Output_file_trimmed+" "+adapters+" "+Input_file_QC+" -q 0")
print("#### FASTQC phase ####")
os.system(PathToFASTQC+" "+path_log+"/QC/"+Output_file_trimmed)
number_of_reads_wN_kept=count_number_sequence(path_log+"/QC/"+Output_file_trimmed)
number_of_reads_from_PEAR=count_number_sequence(Pear_output_file+".assembled.fastq")
with open(file_count_percent_of_reads_kept,'a') as read_kept:
read_kept.write("#######################\nTrimming with FASTQ-MCF\n#######################\n")
read_kept.write("Number of reads kept after trimming : "+str(number_of_reads_wN_kept)+'\n')
read_kept.write("Percent of reads kept after trimming :"+str(number_of_reads_wN_kept/number_of_reads_from_PEAR)+"\n")
read_kept.write("Report FASTQC can be found in QC/ repository\n")
print('\n###############\nTAG HANDLING\n#################\n')
os.makedirs(path_log+"/TAG", exist_ok=True)
Input_file_tag=path_log+"/QC/"+filename_wo_extension+".assembled.trimmed.fastq"
Output_file_with_N=path_log+"/QC/"+filename_wo_extension+"withN.assembled.trimmed.fastq"
Output_file_tag=path_log+"/TAG/"+filename_wo_extension+".assembled.trimmed.untagged.fastq"
number_of_sequencewithN=count_number_sequence_withN(Input_file_tag)
number_of_sequence_total=count_number_sequence(Input_file_tag)
if number_of_sequencewithN > 0 :
os.system(" grep -B1 -A2 ^[ATCGN]*N[ATCGN]*$ "+Input_file_tag+" > temporary_file")
os.system(" sed '/^--$/d' temporary_file > "+Output_file_with_N)
os.system(" grep -B1 -A2 ^[ATCG]*$ "+Input_file_tag+"> temporary_file")
os.system( "sed '/^--$/d' temporary_file > "+Input_file_tag)
os.system("rm temporary_file")
with open(file_count_percent_of_reads_kept,'a') as read_kept:
read_kept.write("############\nTAG handling\n############\n")
read_kept.write("Number of sequence found with a N in it (these sequences will be eliminated for the analyse but kept in a file in the QC folder ):"+str(number_of_sequencewithN)+"\n")
if number_of_sequencewithN/number_of_sequence_total > 0.01 :
read_kept.write("Warning number of sequence with a N in it is superior to 1% :"+str(number_of_sequencewithN/number_of_sequence_total)+"\n")
else:
read_kept.write("Percent of sequence with a N : "+str(number_of_sequencewithN/number_of_sequence_total)+"\n")
else :
with open(file_count_percent_of_reads_kept,'a') as read_kept:
read_kept.write("No sequence with N found\n")
temporary_file = path_log+"/TAG/temp.txt"
output_file_count_tag = path_log+"/TAG/"+filename_wo_extension+".count_untagged.fasta"
os.system(r'''grep ^[ATCGN]*$ '''+Input_file_tag+r''' |sort|uniq -c|awk -F' ' 'BEGIN {OFS=""} {print $2,":",$1}'|sed 's/^\ *//g' > '''+temporary_file)
tag_handling.handle_tag(temporary_file,output_file_count_tag)
os.system('rm '+temporary_file)
nb_unique_seq = count_nb_lines(output_file_count_tag)
with open(file_count_percent_of_reads_kept,'a') as read_kept:
read_kept.write("Number of unique sequence : "+str(nb_unique_seq)+"\nThis file can be found in TAG/ repository\n")
print('handling tags done')
print('\n###############\nCOLLAPSE AND COUNTING TAGS\n#################\n')
os.makedirs(path_log+'/collapser', exist_ok =True)
CollapseFile=path_log+'/collapser/'+filename_wo_extension+".assembled.trimmed.untagged.collapser.fasta"
outputFile=path_log+'/count/'+filename_wo_extension+".collapseTagcount.txt"
transform_tag.write_file(output_file_count_tag, CollapseFile)
print('collapse and counting tags done')
print('\n###############\nALIGNMENT WITH NEEDLEMAN & WUNSCH\n#################\n')
#repository creation
os.makedirs(path_log+'/Aligner',exist_ok = True)
fileINA=path_log+'/collapser/'+filename_wo_extension+'.assembled.trimmed.untagged.collapser.fasta'
#We realigne the sequence with 2 or more insertions/deletions with a needleman & wunsch algorithm
for seq_tar in sequence_target :
print(seq_tar)
seq_name = os.path.basename(seq_tar).split('.')[0]
sequence_realigned=path_log+'/Aligner/'+seq_name+'.aligned.needle.srs'
print(sequence_realigned)
os.system("needle -snucleotide1 Y -sformat fasta -asequence "+seq_tar+" -snucleotide2 Y -sformat2 fasta -bsequence "+fileINA+" -datafile "+PathToMatrix+" -gapopen 10.0 -gapextend 0.5 -awidth 99999 -outfile "+sequence_realigned)
with open(file_count_percent_of_reads_kept,'a') as read_kept:
read_kept.write("#################\nAlignment with NW\n#################\nAlignment with Needlemman & Wunsch done, the alignment file can be found in Aligner/ repository\n")
if len(sequence_target) > 1:
aln = {}
for srs_file in os.listdir(path_log+'/Aligner/') :
file_srs = path_log+'/Aligner/'+srs_file
new_file_srs = path_log+'/Aligner/'+srs_file.split('.')[0]+'.best_ali_score.srs'
aln = compare_srs_file.readfiles(aln,file_srs,new_file_srs)
os.system('rm '+file_srs)
compare_srs_file.compare_score(aln)
print('\n###################\nFREQUENCY CALCULATION\n######################')
#repository creation
os.makedirs(path_log+'/frequence', exist_ok =True)
frequence_mutations_file = path_log+'/frequence/'+filename_wo_extension+'_frequence_mut.txt'
os.makedirs(path_log+'/logo', exist_ok = True)
with open(file_count_percent_of_reads_kept,'a') as read_kept :
read_kept.write("#####################\nFrequency calculation\n#####################\n")
for i in range(len(os.listdir(path_log+'/Aligner/'))) :
frequence_mutations_file = path_log+'/frequence/'+os.path.basename(sequence_target[i]).split('.')[0]+'_frequence_mut.txt'
print(frequence_mutations_file)
fileIn = path_log+'/Aligner/'+os.listdir(path_log+'/Aligner/')[i]
if len(pam_position) == 2:
countFrequenceFromSRS_with2PAM.write_mutations(frequence_mutations_file,sequence_target[i],fileIn,pam_position,primers[i],file_count_percent_of_reads_kept,score,direction)
else :
countFrequenceFromSRS.write_mutations(frequence_mutations_file,sequence_target[i],fileIn,pam_position,primers[i],file_count_percent_of_reads_kept,score,direction)
long_file_mut = path_log+'/frequence/'+os.path.basename(sequence_target[i]).split('.')[0]+'_frequence_mut.long.txt'
nb_mut = count_nb_lines(long_file_mut)
with open(file_count_percent_of_reads_kept,'a') as read_kept:
read_kept.write("Number of unique mutations : "+os.path.basename(sequence_target[i]).split('.')[0]+' '+str(nb_mut)+'\n')
output_logo = path_log+'/logo/'+os.path.basename(sequence_target[i]).split('.')[0]+'logo.svg'
creation_logo.construct_logo(long_file_mut,primers[i],sequence_target[i],cutoff,output_logo,pam_position,direction)
print('\n###################\nCREATION LOGO\n#####################')