
Commit ece92ad: "catching up"
1 parent 2d0e8b5

8 files changed: +72, -38 lines

realignment/scripts/get_reference.py (+9, -4)

@@ -4,10 +4,15 @@


 fasta = pysam.FastaFile(sys.argv[1])
-chrom, rest = sys.argv[2].split(':')
-start, end = rest.split('-')
-start = int(start)
-end = int(end)
+if ':' in sys.argv[2]:
+    chrom, rest = sys.argv[2].split(':')
+    start, end = rest.split('-')
+    start = int(start)
+    end = int(end)
+else:
+    chrom = sys.argv[2]
+    start = 1
+    end = fasta.get_reference_length(chrom)

 oseq = fasta.fetch(chrom, start - 1, end)
 oseq = re.sub("(.{60})", "\\1\n", oseq, 0, re.DOTALL)
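A quick usage sketch of the updated script: it still takes a FASTA and a `chrom:start-end` region string, but now also accepts a bare chromosome name (the file names below are placeholders, not from the commit):

```bash
# region form: emit chr1:10000-20000 from the reference
python realignment/scripts/get_reference.py GRCh38.fa chr1:10000-20000 > region.fa
# chromosome-only form: falls back to 1..contig length via fasta.get_reference_length()
python realignment/scripts/get_reference.py GRCh38.fa chr1 > chr1.fa
```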

realignment/scripts/msa_realign_pipe.sh (+19, -15)

@@ -12,11 +12,14 @@ echo "MSA realignment of region" $region
 mkdir -p msa_$region

 #bcftools norm -m - -c s -f $ref -r $region $vcf > msa_${region}/variants.vcf
-bcftools view -r $region -o msa_${region}/variants.vcf.gz -O z $vcf
+bcftools view -r $region $vcf \
+    | bcftools +fill-from-fasta /dev/stdin -- -c REF -f $ref \
+    | bgzip > msa_${region}/variants.vcf.gz
+
 tabix msa_${region}/variants.vcf.gz

 python $DIR/get_reference.py $ref $region > msa_${region}/haps.fa
-for i in $(zgrep -m1 '#CHROM' $vcf | cut -f10-)
+for i in $(bcftools view -h $vcf | grep -m1 '#CHROM' | cut -f10-)
 do
 samtools faidx $ref $region | bcftools consensus -H1 --sample $i $vcf | python $DIR/fa_rename.py ${i}_1 >> msa_${region}/haps.fa
 samtools faidx $ref $region | bcftools consensus -H2 --sample $i $vcf | python $DIR/fa_rename.py ${i}_2 >> msa_${region}/haps.fa

@@ -25,7 +28,7 @@ python $DIR/remove_redundant.py msa_${region}/haps.fa > msa_${region}/haps_nored


 #/users/u233287/scratch/misc_software/mafft-linux64/mafft.bat --auto msa_${region}/haps_noredund.txt > msa_${region}/aln_results.txt
-/users/u233287/scratch/misc_software/mafft-linux64/mafft.bat --auto msa_${region}/haps.fa > msa_${region}/aln_results.txt
+/users/u233287/scratch/misc_software/mafft-linux64/mafft.bat --retree 2 --maxiterate 0 msa_${region}/haps.fa > msa_${region}/aln_results.txt
 #/users/u233287/scratch/misc_software/mafft-linux64/mafft.bat --globalpair --maxiterate 1000 msa_${region}/haps_noredund.txt > msa_${region}/aln_results.txt

 #./ProGraphMSA+TR.sh -o result_${region}.txt -R haps_noredund_${region}.txt

@@ -42,17 +45,18 @@ bash $DIR/mend_report.sh msa_${region}/variants.vcf.gz >> msa_${region}/report.t
 echo "Realigned" >> msa_${region}/report.txt
 bash $DIR/mend_report.sh msa_${region}/result.vcf.gz >> msa_${region}/report.txt

-python $DIR/get_reference.py $ref $region > msa_${region}/haps_final.fa
-for i in $(zgrep -m1 '#CHROM' $vcf | cut -f10-)
-do
-samtools faidx $ref $region \
-    | bcftools consensus -H1 --sample $i msa_${region}/result.vcf.gz \
-    | python $DIR/fa_rename.py ${i}_1 >> msa_${region}/haps_final.fa
-samtools faidx $ref $region \
-    | bcftools consensus -H2 --sample $i msa_${region}/result.vcf.gz \
-    | python $DIR/fa_rename.py ${i}_2 >> msa_${region}/haps_final.fa
-done
-
-echo "md5sums" $(sort msa_${region}/haps.fa | md5sum) $(sort msa_${region}/haps_final.fa | md5sum)
+# Turning off validation checking for now
+#python $DIR/get_reference.py $ref $region > msa_${region}/haps_final.fa
+#for i in $(zgrep -m1 '#CHROM' $vcf | cut -f10-)
+#do
+#samtools faidx $ref $region \
+#| bcftools consensus -H1 --sample $i msa_${region}/result.vcf.gz \
+#| python $DIR/fa_rename.py ${i}_1 >> msa_${region}/haps_final.fa
+#samtools faidx $ref $region \
+#| bcftools consensus -H2 --sample $i msa_${region}/result.vcf.gz \
+#| python $DIR/fa_rename.py ${i}_2 >> msa_${region}/haps_final.fa
+#done
+#
+#echo "md5sums" $(sort msa_${region}/haps.fa | md5sum) $(sort msa_${region}/haps_final.fa | md5sum)
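A side note on the `bcftools +fill-from-fasta` step added above: a minimal spot-check sketch, not part of the committed script, reusing `$region` and `$ref` from the pipeline, to confirm the filled REF alleles match the reference:

```bash
# Print the first few REF alleles next to the corresponding reference bases;
# the last two columns should agree after +fill-from-fasta.
bcftools query -f '%CHROM\t%POS\t%REF\n' msa_${region}/variants.vcf.gz | head -n 5 \
    | while read chrom pos allele
do
    # same span from the reference FASTA (1-based, inclusive coordinates)
    refseq=$(samtools faidx "$ref" "${chrom}:${pos}-$((pos + ${#allele} - 1))" | tail -n +2 | tr -d '\n')
    echo -e "${chrom}\t${pos}\t${allele}\t${refseq}"
done
```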

regions/intersection/README.md (+24)

@@ -89,3 +89,27 @@ So, 3.81% of the genome which the v0.2 TR regions cover contains
 - 45.2% of all variants by bases effected
 - 75.5% of SVs by count
 - 47.0% of SVs by bases effected
+
+
+Question 3
+==========
+Can we find expansions/contractions of the tr_annotations inside the variants?
+
+The `filtered_variants_to_regions.txt` is now our new version of the tr_regions.bed. We'll use that to repeat the
+'Defining Repeats' steps described in `../README.md`
+
+
+```bash
+samtools faidx -r <(zcat tr_regions.bed.gz | awk '{print $1 ":" $2 "-" $3}') \
+    ~/scratch/insertion_ref/msru/data/reference/grch38/GRCh38_1kg_mainchrs.fa > tr_regions.fasta
+trf409.linux64 data/tr_regions.fasta 3 7 7 80 5 40 500 -h -ngs > data/grch38.tandemrepeatfinder.txt
+```
+
+Then run TRF on the reference sequence of regions:
+```bash
+trf409.linux64 data/tr_regions.fasta 3 7 7 80 5 5 500 -h -ngs > data/grch38.tandemrepeatfinder.txt
+```
+
+Because we're going to be using the variants to filter these repeat annotations, we lower the min-score from 40 to 5,
+the idea being we're more interested in sensitivity.
+
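For readers unfamiliar with TRF's positional arguments, the command in the added README text breaks down as follows (parameter names taken from the TRF usage string; this annotation is editorial, not part of the commit):

```bash
# trf <File> <Match> <Mismatch> <Delta> <PM> <PI> <Minscore> <MaxPeriod>
# 3 7 7 80 5 5 500 -> match weight 3, mismatch penalty 7, indel penalty 7,
#                     match probability 80, indel probability 5,
#                     minimum alignment score 5, maximum period size 500
# -h suppresses the HTML output; -ngs emits the compact multi-sequence format
trf409.linux64 data/tr_regions.fasta 3 7 7 80 5 5 500 -h -ngs > data/grch38.tandemrepeatfinder.txt
```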

regions/intersection/variant_region_intersection.py (+5, -4)

@@ -28,20 +28,21 @@ def main(in_bed, in_vcf, out_name):
     variants = pysam.VariantFile(in_vcf)
     with open(f"counts_{out_name}", 'w') as fout:
         for line in optional_compressed_fh(in_bed):
-            chrom, start, end = line.strip().split('\t')[:3]
+            line = line.strip()
+            chrom, start, end = line.split('\t')[:3]
             start = int(start)
             end = int(end)
             cnt = 0
             bases = 0
             for i in variants.fetch(chrom, int(start), int(end)):
                 # check only svs.. take this out for core analysis but keep in for extra analysis
-                if 'SVLEN' not in i.info or i.info["SVLEN"] < 50:
-                    continue
+                #if 'SVLEN' not in i.info or i.info["SVLEN"] < 50:
+                    #continue
                 vs, ve = truvari.entry_boundaries(i)
                 if start <= vs and ve <= end:
                     cnt += 1
                     bases += truvari.entry_size(i)
-            fout.write(f"{chrom}\t{start}\t{end}\t{cnt}\t{bases}\n")
+            fout.write(f"{line}\t{cnt}\t{bases}\n")
     data = pd.read_csv(f"counts_{out_name}", sep='\t', header=None, names=['chrom', 'start', 'end', 'num_vars', 'num_bases'])
     print("statistic\tcount\tpercent")

variants/README.md (+3, -2)

@@ -106,11 +106,12 @@ First, we make the header
 bcftools merge -m none hapo_merged/*.vcf.gz -o pVCFs/GRCh38.variants.header.vcf --print-header --force-samples
 ```
 Our run had redundant sample names which were manually altered inside of the output header e.g. NA24385 is assembled by
-both eichler and li, therefor the li was renamed in the new header li:NA24385.
+both eichler and li, therefore the li was renamed in the new header li:NA24385.

 Next, we merge the vcfs
 ```bash
-bcftools merge -m none hapo_merged/*.vcf.gz --use-header pVCFs/GRCh38.variants.header.vcf | bcftools view -S pVCFs/sample_order.txt -o pVCFs/GRCh38.variants.vcf.gz -O z
+bcftools merge -m none hapo_merged/*.vcf.gz --use-header pVCFs/GRCh38.variants.header.vcf \
+    | bcftools view -S pVCFs/sample_order.txt -o pVCFs/GRCh38.variants.vcf.gz -O z
 ```
 The sample_order.txt is just the list of the sample names so that we can control their order in the pVCF.

variants/scripts/annotate_pvcf_cov.py (-4)

@@ -41,10 +41,6 @@
         if sample + '.1' not in dn_pos_coverage:
             logging.warning("missing dn %s at %s:%d.1", sample, entry.chrom, entry.start)

-        if entry.samples[sample]["FT"] != '.':
-            # Trust the earlier step got the coverage right
-            continue
-
         u_cov1 = int(up_pos_coverage[sample + '.0'])
         u_cov2 = int(up_pos_coverage[sample + '.1'])
         d_cov1 = int(dn_pos_coverage[sample + '.0'])

variants/scripts/map_haplo.sh (+8, -7)

@@ -2,20 +2,19 @@


 # Input fasta haplotype
-fasta=$1
+fasta=$(realpath $1)
 # Reference
-ref=$2
+ref=$(realpath $2)
 sample_name=$3
 # Directory to write the output folder
-out_dir=$4
+out_dir=$(realpath $4)

 # These would need to be a parameter / config
 params='-cx asm20 -m 10000 -z 10000,50 -r 50000,2000000 --end-bonus=100 --rmq=yes -O 5,56 -E 4,1 -B'
 threads=8
 # minimum quality score of alignments to consider
 min_qual=60

-
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 if ! command -v minimap2 &> /dev/null
 then

@@ -25,18 +24,20 @@ fi

 if ! command -v paftools.js &> /dev/null
 then
-    echo "Program 'minimap2' not found in environment"
+    echo "Program 'paftools.js' not found in environment"
     exit 1
 fi

-
 mkdir -p ${out_dir}
 cd ${out_dir}
 anno=$DIR/annotate_cov.py
 minimap2 ${params} -t${threads} --secondary=no --cs ${ref} ${fasta} \
     | sort -k6,6 -k8,8n > aln.paf
 paftools.js stat aln.paf > aln.paf.stats.txt
-cat aln.paf | paftools.js call -f ${ref} -q ${min_qual} -L10000 - -s ${sample_name} | vcf-sort | bgzip > aln.vcf.gz
+cat aln.paf | paftools.js call -f ${ref} -q ${min_qual} -L10000 -s ${sample_name} - \
+    | vcf-sort \
+    | bcftools +fill-from-fasta /dev/stdin -- -c REF -f ${ref} \
+    | bgzip > aln.vcf.gz
 tabix aln.vcf.gz

 awk -v mq=${min_qual} '{if ($12 >= mq) print $6 "\t" $8 "\t" $9}' aln.paf > aln.bed
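A hedged usage sketch of the script above, inferred from its positional arguments (sample and file names are placeholders):

```bash
# args: <haplotype fasta> <reference fasta> <sample name> <output directory>
bash variants/scripts/map_haplo.sh HG002_hap1.fa GRCh38.fa HG002 mapped/HG002_hap1
# the output directory ends up with aln.paf, aln.paf.stats.txt, aln.vcf.gz (+ .tbi), and aln.bed
```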

variants/scripts/mk_mergehaps.sh (+4, -2)

@@ -1,10 +1,12 @@
+pairs=$1
 mkdir -p jobs
 mkdir -p logs

-cat metadata/hap_pairs.txt | while read h1 c1 h2 c2 proj samp
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+cat $pairs | while read h1 c1 h2 c2 proj samp
 do
     echo "#!/bin/bash" > jobs/hapmerge_${proj}_${samp}.sh
-    echo "bash scripts/merge_haps.sh $h1 $c1 $h2 $c2 $samp hapo_merged/${proj}_${samp}.vcf.gz" >> jobs/hapmerge_${proj}_${samp}.sh
+    echo "bash $DIR/merge_haps.sh $h1 $c1 $h2 $c2 $samp hapo_merged/${proj}_${samp}.vcf.gz" >> jobs/hapmerge_${proj}_${samp}.sh
 done
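A brief usage sketch of the now-parameterized script; the pairs file shown is the previously hard-coded default, and its six whitespace-separated fields are exactly what the `while read` loop above consumes:

```bash
# each record: h1 c1 h2 c2 proj samp
bash variants/scripts/mk_mergehaps.sh metadata/hap_pairs.txt
# writes one jobs/hapmerge_${proj}_${samp}.sh wrapper per record
```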
