-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_introme.sh
executable file
·481 lines (376 loc) · 24 KB
/
run_introme.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
#!/bin/bash
# Introme Bash Script
#
# Developers:
# Velimir Gayevskiy ([email protected])
# Sarah Beecroft ([email protected])
#
##################################
# CUSTOM FUNCTIONS
# Check whether a string is an array element
# Usage: stringinarray "string" "${array[@]}"
# $? == 0 if true, otherwise 1
stringinarray () {
local e match="$1"
shift
for e; do [[ "$e" == "$match" ]] && return 0; done
return 1
}
##################################
# DEFINE VARIABLES
# Directories
out_dir='./output'
# Hard filter cutoffs for each annotation, filtering thresholds will depend on your application, we urge you to carefully consider these
MGRB_AF='<=0.01' # Minimum MGRB allele frequency (healthy Australian population)
gnomad_popmax_AF='<=0.01' # Minimum gnomAD allele frequency in the population with the highest allele frequency in gnomAD
CADD_Phred='>=0' # Minimum CADD score (phred-scaled)
min_QUAL='>=200' # The QUAL VCF field
min_DP='>=20' # The sample with the highest depth must have a equal or larger depth than this value
min_AD='>=5' # The sample with the highest number of alternate reads must have a equal or larger number than this value
##################################
# INTROME ARGUMENTS
while getopts "a:b:v:p:i:r:" opt; do
case $opt in
a) affected_samples+=("$OPTARG");; # Array of affected samples
b) input_BED="$OPTARG";; # Input BED file (i.e. regions of interest)
v) input_VCF="$OPTARG";; # Input VCF file
p) prefix="$OPTARG";; # Output file prefix
i) inheritance_pattern="$OPTARG";; # Inheritance pattern to use
r) reference_genome="$OPTARG";; # Path to the reference genome used for mapping
# TODO add -g for gene and -G for gene list file
esac
done
shift $(( $OPTIND - 1 )) # Remove parsed options and args from $@ list
# Make sure each required argument has a non-empty value
if [ -z $affected_samples ]; then
echo "No affected samples supplied."
exit 1
elif [ -z $input_BED ]; then
echo "No restriction BED file supplied."
exit 1
elif [ -z $input_VCF ]; then
echo "No path to an input VCF file has been supplied."
exit 1
elif [ -z $prefix ]; then
echo "No output prefix has been supplied."
exit 1
elif [ -z $inheritance_pattern ]; then
echo "No inheritance pattern has been supplied."
exit 1
elif [ -z $reference_genome ]; then
echo "No reference genome file has been supplied."
exit 1
fi
##################################
# STEP 1: subsetting the VCF to genomic regions of interest (first because it gets rid of the most variants)
echo $(date +%x_%r) 'Beginning subsetting to genomic regions of interest'
echo $(gzip -d -c $input_VCF | grep -v '^#' | wc -l) 'variants prior to subsetting'
bedtools intersect -header -u -a $input_VCF -b $input_BED | bgzip > $out_dir/$prefix.subset.vcf.gz # -u for unique record in VCF, otherwise multiple variants are output for overlapping introns
tabix -p vcf $out_dir/$prefix.subset.vcf.gz
echo $(gzip -d -c $out_dir/$prefix.subset.vcf.gz | grep -v '^#' | wc -l) 'variants after subsetting'
echo $(date +%x_%r) 'Subsetting complete'
##################################
# STEP 2: familial filtering (second because it gets rid of the second most variants)
# If an inheritance pattern has been specified
if [[ $inheritance_pattern != "none" ]]; then
echo $(date +%x_%r) 'Beginning familial filtering'
# Capture each sample name in the VCF into an array
vcfsamples=( $(bcftools query -l $out_dir/$prefix.subset.vcf.gz) )
# Variable to hold the bcftools genotype filter string
genotype_filters=""
# Variable to hold the current VCF sample index, used in referring to the right sample in filtering
i=0
# Go through every VCF sample name (they are ordered as they are in the VCF which is important for the index)
while [[ $i -le $(( ${#vcfsamples[@]} - 1 )) ]]; do
# Code to not apply genotype filtering to unaffected samples, will implement properly soon...
#stringinarray "${vcfsamples[$i]}" "${affected_samples[@]}"
#if [ $? != 0 ]; then
# (( i++ ))
# continue
#fi
# Start of the genotype filter string
genotype_filters+=" && FORMAT/GT[$i]='"
# Run custom function to check if the current VCF sample has been supplied as an affected sample
stringinarray "${vcfsamples[$i]}" "${affected_samples[@]}"
# If the current VCF sample is affected
if [ $? == 0 ]; then
if [[ $inheritance_pattern == "heterozygous" ]]; then
genotype_filters+="0/1"
elif [[ $inheritance_pattern == "homozygous" ]]; then
genotype_filters+="1/1"
fi
# If the current VCF sample is not affected
else
if [[ $inheritance_pattern == "heterozygous" ]]; then
genotype_filters+="0/0"
elif [[ $inheritance_pattern == "homozygous" ]]; then
genotype_filters+="0/1"
fi
fi
# Close off genotype filter string
genotype_filters+="'"
# Iterate the $i variable
(( i++ ))
done
# Cut the 4 first characters off the filter string (the " && ")
genotype_filters=$(echo $genotype_filters | cut -c 4-)
bcftools filter --threads $(getconf _NPROCESSORS_ONLN) -i"$genotype_filters" $out_dir/$prefix.subset.vcf.gz | bgzip > $out_dir/$prefix.subset.inheritancefilter.vcf.gz
tabix $out_dir/$prefix.subset.inheritancefilter.vcf.gz
echo $(gzip -d -c $out_dir/$prefix.subset.inheritancefilter.vcf.gz | grep -v '^#' | wc -l) 'variants after familial filter'
echo $(date +%x_%r) 'Familial filtering complete'
# If the "none" inheritance pattern has been specified, just use the subset VCF as the inheritance filtered VCF so the rest of the pipeline carries on as normal
else
mv $out_dir/$prefix.subset.vcf.gz $out_dir/$prefix.subset.inheritancefilter.vcf.gz
mv $out_dir/$prefix.subset.vcf.gz.tbi $out_dir/$prefix.subset.inheritancefilter.vcf.gz.tbi
fi
##################################
# STEP 3: Hard filtering on variant quality (this is here to reduce the number of variants going into the CPU-costly annotation step below)
echo $(date +%x_%r) 'Beginning quality filtering'
bcftools filter --threads $(getconf _NPROCESSORS_ONLN) -i"(FILTER='PASS' || FILTER='.') && TYPE='snp' && (QUAL$min_QUAL || QUAL='.') && MAX(FORMAT/DP[*])$min_DP && MAX(FORMAT/AD[*:1])$min_AD" $out_dir/$prefix.subset.inheritancefilter.vcf.gz | bgzip > $out_dir/$prefix.subset.inheritancefilter.highquality.vcf.gz
tabix -p vcf $out_dir/$prefix.subset.inheritancefilter.highquality.vcf.gz
echo $(gzip -d -c $out_dir/$prefix.subset.inheritancefilter.highquality.vcf.gz | grep -v '^#' | wc -l) 'variants after filtering'
echo $(date +%x_%r) 'Filtering complete'
##################################
# STEP 4: annotate the subsetted VCF with useful information, to be used for filtering downstream
echo $(date +%x_%r) 'Beginning annotation'
export IRELATE_MAX_GAP=1000 # Note: this is set to speed up annotation when .csi (as opposed to .tbi) files are present via https://github.com/brentp/vcfanno/releases/
vcfanno -p $(getconf _NPROCESSORS_ONLN) conf.toml $out_dir/$prefix.subset.inheritancefilter.highquality.vcf.gz | bgzip > $out_dir/$prefix.subset.inheritancefilter.highquality.annotated.vcf.gz # The conf.toml file specifies what VCFanno should annotate
tabix -p vcf $out_dir/$prefix.subset.inheritancefilter.highquality.annotated.vcf.gz
echo $(date +%x_%r) 'Annotation complete'
##################################
# STEP 5: Hard filtering on the values of annotations added in the previous step
echo $(date +%x_%r) 'Beginning filtering by annotation values'
bcftools filter --threads $(getconf _NPROCESSORS_ONLN) -i"(MGRB_AF$MGRB_AF || MGRB_AF='.') && (gnomAD_PM_AF$gnomad_popmax_AF || gnomAD_PM_AF='.') && (Branchpointer_Branchpoint_Prob!='.' || (Branchpointer_Branchpoint_Prob='.' && (CADD_Phred$CADD_Phred || CADD_Phred='.')))" $out_dir/$prefix.subset.inheritancefilter.highquality.annotated.vcf.gz | bgzip > $out_dir/$prefix.subset.inheritancefilter.highquality.annotated.filtered.vcf.gz
tabix -p vcf $out_dir/$prefix.subset.inheritancefilter.highquality.annotated.filtered.vcf.gz
echo $(gzip -d -c $out_dir/$prefix.subset.inheritancefilter.highquality.annotated.filtered.vcf.gz | grep -v '^#' | wc -l) 'variants after filtering'
echo $(date +%x_%r) 'Filtering complete'
##################################
# STEP 6: Output final list of variants as a spreadsheet-friendly TSV
bcftools query -H -f '%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%INFO/Gene_Symbol\t%INFO/Gene_Strand\t%INFO/CADD_Phred\t%INFO/MGRB_AF\t%INFO/gnomAD_PM_AF\t%INFO/CSQ\t%INFO/SPIDEX_dPSI_Max_Tissue\t%INFO/SPIDEX_dPSI_Zscore\t%INFO/dbscSNV_AdaBoost_Score\t%INFO/dbscSNV_RandomForest_Score\t%INFO/Branchpointer_Branchpoint_Prob\t%INFO/Branchpointer_U2_Binding_Energy[\t%GT][\t%DP][\t%AD][\t%GQ]\n' $out_dir/$prefix.subset.inheritancefilter.highquality.annotated.filtered.vcf.gz > $out_dir/$prefix.introme.tsv
# Remove the '[<number]' column numbers added by bcftools query to column names
grep '^#' $out_dir/$prefix.introme.tsv | sed "s/\[[0-9]*\]//g" > $out_dir/$prefix.introme.tsv.header
grep -v '^#' $out_dir/$prefix.introme.tsv >> $out_dir/$prefix.introme.tsv.header
mv $out_dir/$prefix.introme.tsv.header $out_dir/$prefix.introme.tsv
# Sort by chromosome and coordinate
sort -k1,1n -k2,2n $out_dir/$prefix.introme.tsv > $out_dir/$prefix.introme.sorted.tsv
mv $out_dir/$prefix.introme.sorted.tsv $out_dir/$prefix.introme.tsv
##################################
# STEP 7: Calculate the maximum 3' and 5' MaxEntScan value for sliding windows around each variant
echo $(date +%x_%r) 'Beginning MaxEntScan calculation, number of variants to process: '$(cat $out_dir/$prefix.introme.tsv | wc -l)
# Number of lines processed for displaying progress to the user
num_lines_processed=0
# Go through each line of the results TSV
cat $out_dir/$prefix.introme.tsv | \
while read line; do
# If the current number of running processes is below a certain number, wait to process further lines
# Processing each line uses ~100 processes on one core (currently, this can change!) so multiply this by the number available for the number of simultaneous processes to create
while [[ $(jobs -r | wc -l) -ge $(echo "100 * $(getconf _NPROCESSORS_ONLN)" | bc -l) ]]; do
sleep 1
done
# Iterate the number of lines processed
((num_lines_processed++))
# Report progress for every 100 variants processed
if [[ $(echo $num_lines_processed | grep '00$') ]]; then
echo $(date +%x_%r) 'Processed '$num_lines_processed' variants'
fi
# Create a block for processing and calculating MaxEntScan scores which is pushed into the background at the end
{
# If the line is the header line
if [[ ${line:0:1} == "#" ]]; then
# Add to the header line
echo "$line"$'\t'"MaxEntScan_Consequence"$'\t'"5'_max_MaxEntScan_value"$'\t'"5'_max_MaxEntScan_reference_value"$'\t'"3'_max_MaxEntScan_value"$'\t'"3'_max_MaxEntScan_reference_value"$'\t'"5'_max_MaxEntScan_direction"$'\t'"3'_max_MaxEntScan_direction"$'\t'"5'_max_MaxEntScan_coordinates"$'\t'"3'_max_MaxEntScan_coordinates"$'\t'"5'_max_MaxEntScan_sequence"$'\t'"5'_max_MaxEntScan_reference_sequence"$'\t'"3'_max_MaxEntScan_sequence"$'\t'"3'_max_MaxEntScan_reference_sequence" > $out_dir/$prefix.introme.annotated.tsv
# Don't do any further processing on this line
continue
fi
# Current variant information
current_chr=$(echo "$line" | cut -f 1)
current_pos=$(echo "$line" | cut -f 2)
current_alt=$(echo "$line" | cut -f 4)
current_strand=$(echo "$line" | cut -f 7)
# Calculate the maximum range we are interested in (corresponding to the 23-base windows)
current_max_coordinate_range="$current_chr:"$(( $current_pos - 22 ))"-"$(( $current_pos + 22 ))
# Fetch the reference genome sequence once and subset it for windows
current_max_reference_context=$(samtools faidx $reference_genome $current_max_coordinate_range | grep -v '^>')
# Determine the direction in which to look for putative splice sites based on the direction of the genes with which the variant overlaps
# If the variant overlaps with both a + and - strand gene
if [[ $(echo $current_strand | grep '-') ]] && [[ $(echo $current_strand | grep '+') ]]; then
current_strand_direction_calculation='unknown'
# If the variant only overlaps with - strand gene(s)
elif [[ $(echo $current_strand | grep '-') ]]; then
current_strand_direction_calculation='reverse'
# If the variant only overlaps with + strand gene(s)
elif [[ $(echo $current_strand | grep '+') ]]; then
current_strand_direction_calculation='forward'
# If the variant does not have a strand direction associated (it probably doesn't overlap with a gene)
else
current_strand_direction_calculation='unknown'
fi
##################################
# 5' MaxEntScan score calculation
max_maxentscan_5=''
max_maxentscan_sequence_5=''
max_maxentscan_coordinates_5=''
max_maxentscan_difference_5=''
max_maxentscan_consequence_5=''
max_maxentscan_type_5=''
# $i is the offset for determining the coordinate range
for i in {0..8}; do
# Determine the putative splice site coordinate range
current_coordinate_range="$current_chr:"$(( $current_pos - 8 + $i ))"-"$(( $current_pos + $i ))
# Extract the current putative splice site sequence as in the reference genome and mutate the correct base corresponding to the variant
current_splice_site_forward=${current_max_reference_context:(( 14 + $i )):(( 8 - $i ))}$current_alt${current_max_reference_context:23:$i} # The numbers here are offsetting to ignore the full 23 base window and focus on the 9 base window
# If the variant gene is not on the reverse strand, calculate the MaxEntScan value for the splice site being on the forward strand
if [[ $current_strand_direction_calculation != "reverse" ]]; then
current_maxentscan_forward=$(echo $current_splice_site_forward | perl MaxEntScan/score5.pl - | cut -f 2)
fi
# If the variant gene is not on the forward strand, calculate reverse values too
if [[ $current_strand_direction_calculation != "forward" ]]; then
# Determine the reverse complement putative splice site sequence
current_splice_site_reverse=$(echo $current_splice_site_forward | tr "[ATCG]" "[TAGC]" | rev)
# Calculate the MaxEntScan value for the splice site being on the reverse strand
current_maxentscan_reverse=$(echo $current_splice_site_reverse | perl MaxEntScan/score5.pl - | cut -f 2)
fi
# If the variant gene is on the forward strand, or the strand is unknown but the forward value is higher than the reverse value
if [[ $current_strand_direction_calculation == "forward" ]] || ([[ $current_strand_direction_calculation == "unknown" ]] && (( $(echo $current_maxentscan_forward' > '$current_maxentscan_reverse | bc -l) ))); then
# If this is the first iteration where a max MaxEntScan hasn't been set yet or the current forward value is larger than the current maximum
if [[ $max_maxentscan_5 == "" ]] || (( $(echo $current_maxentscan_forward' > '$max_maxentscan_5 | bc -l) )); then
max_maxentscan_5=$current_maxentscan_forward
max_maxentscan_sequence_5=$current_splice_site_forward
max_maxentscan_coordinates_5=$current_coordinate_range
max_maxentscan_type_5="forward"
fi
else
# If this is the first iteration where a max MaxEntScan hasn't been set yet or the current reverse value is larger than the current maximum
if [[ $max_maxentscan_5 == "" ]] || (( $(echo $current_maxentscan_reverse' > '$max_maxentscan_5 | bc -l) )); then
max_maxentscan_5=$current_maxentscan_reverse
max_maxentscan_sequence_5=$current_splice_site_reverse
max_maxentscan_coordinates_5=$current_coordinate_range
max_maxentscan_type_5="reverse"
fi
fi
done
# Fetch the reference genome sequence for the window coordinates
max_maxentscan_sequence_5_reference=$(samtools faidx $reference_genome $max_maxentscan_coordinates_5 | grep -v '^>')
# Adjust the reference genome sequence if the most damaging putative splice site is on the reverse strand
if [[ $max_maxentscan_type_5 == "reverse" ]]; then
max_maxentscan_sequence_5_reference=$(echo $max_maxentscan_sequence_5_reference | tr "[ATCG]" "[TAGC]" | rev)
fi
# Calculate the reference genome score
max_maxentscan_5_reference=$(echo $max_maxentscan_sequence_5_reference | perl MaxEntScan/score5.pl - | cut -f 2)
# Calculate the absolute difference between the reference and variant scores (i.e. take negative numbers into account)
max_maxentscan_difference_5=$(echo $max_maxentscan_5" - "$max_maxentscan_5_reference | bc -l | sed 's/-//') # Force the difference to be positive so it's an absolute difference
# Determine the potential MaxEntScan consequence (i.e. likelihood of a new splice site to replace the canonical one)
# If the score is below the reference score or below zero
if (( $(echo $max_maxentscan_5' <= '$max_maxentscan_5_reference | bc -l) )) || (( $(echo $max_maxentscan_5' < 0' | bc -l) )); then
max_maxentscan_consequence_5='NONE'
# If the score is 0-4 and the difference to the reference score is >=4
elif (( $(echo $max_maxentscan_5' < 4' | bc -l) )) && (( $(echo $max_maxentscan_difference_5' >= 4' | bc -l) )); then
max_maxentscan_consequence_5='LOW'
# If the score is 4-10 and the difference to the reference score is >=4
elif (( $(echo $max_maxentscan_5' < 10' | bc -l) )) && (( $(echo $max_maxentscan_difference_5' >= 4' | bc -l) )); then
max_maxentscan_consequence_5='MED'
# If the score is greater than 10 and the difference to the reference score is >= 6
elif (( $(echo $max_maxentscan_5' >= 10' | bc -l) )) && (( $(echo $max_maxentscan_difference_5' >= 6' | bc -l) )); then
max_maxentscan_consequence_5='HIGH'
fi
##################################
# 3' MaxEntScan score calculation
max_maxentscan_3=''
max_maxentscan_sequence_3=''
max_maxentscan_coordinates_3=''
max_maxentscan_difference_3=''
max_maxentscan_consequence_3=''
max_maxentscan_type_3=''
# $i is the offset for determining the coordinate range
for i in {0..22}; do
# Determine the putative splice site coordinate range
current_coordinate_range="$current_chr:"$(( $current_pos - 22 + $i ))"-"$(( $current_pos + $i ))
# Extract the current putative splice site sequence as in the reference genome and mutate the correct base corresponding to the variant
current_splice_site_forward=${current_max_reference_context:(( 0 + $i )):(( 22 - $i ))}$current_alt${current_max_reference_context:23:$i}
# If the variant gene is not on the reverse strand, calculate the MaxEntScan value for the splice site being on the forward strand
if [[ $current_strand_direction_calculation != "reverse" ]]; then
current_maxentscan_forward=$(echo $current_splice_site_forward | perl MaxEntScan/score3.pl - | cut -f 2)
fi
# If the variant gene is not on the forward strand, calculate reverse values too
if [[ $current_strand_direction_calculation != "forward" ]]; then
# Determine the reverse complement putative splice site sequence
current_splice_site_reverse=$(echo $current_splice_site_forward | tr "[ATCG]" "[TAGC]" | rev)
# Calculate the MaxEntScan value for the splice site being on the reverse strand
current_maxentscan_reverse=$(echo $current_splice_site_reverse | perl MaxEntScan/score3.pl - | cut -f 2)
fi
# If the variant gene is on the forward strand, or the strand is unknown but the forward value is higher than the reverse value
if [[ $current_strand_direction_calculation == "forward" ]] || ([[ $current_strand_direction_calculation == "unknown" ]] && (( $(echo $current_maxentscan_forward' > '$current_maxentscan_reverse | bc -l) ))); then
# If this is the first iteration where a max MaxEntScan hasn't been set yet or the current forward value is larger than the current maximum
if [[ $max_maxentscan_3 == "" ]] || (( $(echo $current_maxentscan_forward' > '$max_maxentscan_3 | bc -l) )); then
max_maxentscan_3=$current_maxentscan_forward
max_maxentscan_sequence_3=$current_splice_site_forward
max_maxentscan_coordinates_3=$current_coordinate_range
max_maxentscan_type_3="forward"
fi
else
# If this is the first iteration where a max MaxEntScan hasn't been set yet or the current reverse value is larger than the current maximum
if [[ $max_maxentscan_3 == "" ]] || (( $(echo $current_maxentscan_reverse' > '$max_maxentscan_3 | bc -l) )); then
max_maxentscan_3=$current_maxentscan_reverse
max_maxentscan_sequence_3=$current_splice_site_reverse
max_maxentscan_coordinates_3=$current_coordinate_range
max_maxentscan_type_3="reverse"
fi
fi
done
# For the maximal MaxEntScan window sequence, calculate MaxEntScan for the reference sequence
max_maxentscan_sequence_3_reference=$(samtools faidx $reference_genome $max_maxentscan_coordinates_3 | grep -v '^>')
# Adjust the reference genome sequence if the most damaging putative splice site is on the reverse strand
if [[ $max_maxentscan_type_3 == "reverse" ]]; then
max_maxentscan_sequence_3_reference=$(echo $max_maxentscan_sequence_3_reference | tr "[ATCG]" "[TAGC]" | rev)
fi
# Calculate the reference genome score
max_maxentscan_3_reference=$(echo $max_maxentscan_sequence_3_reference | perl MaxEntScan/score3.pl - | cut -f 2)
# Calculate the absolute difference between the reference and variant scores (i.e. take negative numbers into account)
max_maxentscan_difference_3=$(echo $max_maxentscan_3" - "$max_maxentscan_3_reference | bc -l | sed 's/-//') # Force the difference to be positive so it's an absolute difference
# Determine the potential MaxEntScan consequence (i.e. likelihood of a new splice site to replace the canonical one)
# If the score is below the reference score or below zero
if (( $(echo $max_maxentscan_3' <= '$max_maxentscan_3_reference | bc -l) )) || (( $(echo $max_maxentscan_3' < 0' | bc -l) )); then
max_maxentscan_consequence_3='NONE'
# If the score is 0-4 and the difference to the reference score is >=4
elif (( $(echo $max_maxentscan_3' < 4' | bc -l) )) && (( $(echo $max_maxentscan_difference_3' >= 4' | bc -l) )); then
max_maxentscan_consequence_3='LOW'
# If the score is 4-10 and the difference to the reference score is >=4
elif (( $(echo $max_maxentscan_3' < 10' | bc -l) )) && (( $(echo $max_maxentscan_difference_3' >= 4' | bc -l) )); then
max_maxentscan_consequence_3='MED'
# If the score is greater than 10 and the difference to the reference score is >= 6
elif (( $(echo $max_maxentscan_3' >= 10' | bc -l) )) && (( $(echo $max_maxentscan_difference_3' >= 6' | bc -l) )); then
max_maxentscan_consequence_3='HIGH'
fi
##################################
# Determine the maximum MaxEntScan consequence
max_maxentscan_consequence=''
if [[ $max_maxentscan_consequence_5 == 'HIGH' ]] || [[ $max_maxentscan_consequence_3 == 'HIGH' ]]; then
max_maxentscan_consequence='HIGH'
elif [[ $max_maxentscan_consequence_5 == 'MED' ]] || [[ $max_maxentscan_consequence_3 == 'MED' ]]; then
max_maxentscan_consequence='MED'
elif [[ $max_maxentscan_consequence_5 == 'LOW' ]] || [[ $max_maxentscan_consequence_3 == 'LOW' ]]; then
max_maxentscan_consequence='LOW'
else
max_maxentscan_consequence='NONE'
fi
##################################
# Scatter the output variants to separate temporary files
# Create an empty temporary file
tmpfile=$(mktemp /tmp/$prefix.introme.annotated.tsv.XXXXX)
# Output the result to the temporary file
echo "$line"$'\t'"$max_maxentscan_consequence"$'\t'"$max_maxentscan_5"$'\t'"$max_maxentscan_5_reference"$'\t'"$max_maxentscan_3"$'\t'"$max_maxentscan_3_reference"$'\t'"$max_maxentscan_type_5"$'\t'"$max_maxentscan_type_3"$'\t'"$max_maxentscan_coordinates_5"$'\t'"$max_maxentscan_coordinates_3"$'\t'"$max_maxentscan_sequence_5"$'\t'"$max_maxentscan_sequence_5_reference"$'\t'"$max_maxentscan_sequence_3"$'\t'"$max_maxentscan_sequence_3_reference" >> $tmpfile
} &
done
# Wait a few seconds for all threads to finish
sleep 30
# Gather all temporary files into the results file
for file in /tmp/$prefix.introme.annotated.tsv.*; do
cat "$file"
rm "$file"
done >> $out_dir/$prefix.introme.annotated.tsv
echo $(date +%x_%r) 'MaxEntScan calculation complete'
# Sort by chromosome and coordinate
sort -k1,1n -k2,2n $out_dir/$prefix.introme.annotated.tsv > $out_dir/$prefix.introme.annotated.sorted.tsv
mv $out_dir/$prefix.introme.annotated.sorted.tsv $out_dir/$prefix.introme.annotated.tsv
##################################
exit