adding v0.3 tr annotations

ACEnglish · ACEnglish · commit b49b00f24c53 · 2022-10-19T14:25:09.000-05:00
diff --git a/codis/get_coverage_counts.py b/codis/get_coverage_counts.py
@@ -5,12 +5,12 @@
 
 
 #cols = ['HG002', "NA24385", "li:NA24385"]
-cols = [_.strip() for _ in open('males.txt', 'r')] # males
-looking_for = ['1,1,0,0', '0,0,1,1']
+#cols = [_.strip() for _ in open('males.txt', 'r')] # males
+#looking_for = ['1,1,0,0', '0,0,1,1']
 #v = (~d[cols].isin(looking_for)).sum()
 
-#cols = d.columns[4:]
-#looking_for = ['1,1,1,1']
+cols = d.columns[4:]
+looking_for = ['1,1,1,1']
 v = (~d[cols].isin(looking_for)).sum()
 # sample, inadequate counts
 print(v.value_counts())
diff --git a/regions/.gitignore b/regions/.gitignore
@@ -20,3 +20,6 @@ data/tr_annotated.jl
 data/tr_regions.fasta.fai
 data/trf_annos_df.jl
 data/tr_regions.bed.gz
+data/usc/
+data/pbsv/
+data/trgt/
diff --git a/regions/DataDescription.md b/regions/DataDescription.md
@@ -1,15 +1,17 @@
 # Versions:
 
 ## v0.3 - More Regions
+(Click the badge to go to the download page)  
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7226352.svg)](https://doi.org/10.5281/zenodo.7226352)
 
 ### CHANGES:
 * Added new annotations sources from:
   * [TRGT](https://github.com/PacificBiosciences/trgt/tree/main/repeats) - Both full regions and pathogenic
   * [pbsv](https://github.com/PacificBiosciences/pbsv/blob/master/annotations/human_GRCh38_no_alt_analysis_set.trf.bed)
   * [Vamos](https://zenodo.org/record/7155334/)
-* Same File structure as v0.2..?
+* See [slides](https://github.com/ACEnglish/adotto/blob/main/slides/GIABTR_English_October172022.pdf) for details
+* Same file structure as v0.2
 
-KnownPathogenic name - Gene Name (could be non) Gene feature (intron/exon/etc)
 
 ## v0.2 - Useable version
 (Click the badge to go to the download page)  
diff --git a/regions/intersection/README.md b/regions/intersection/README.md
@@ -20,44 +20,49 @@ Note, to prevent large SVs which span regions from altering our counts, the vari
 within the region's boundaries.
 
 This creates two files:
-- `counts_variants_to_regions.txt` - input regions.bed entry annotated with number of variants and number of variant bases
-- `filtered_variants_to_regions.txt` - the counts file filtered to only regions containing any non-SNP variants
+- `counts_<output>` - input region entries annotated with the total number of variants, the number of SNPs, and the number of non-SNP variants
+- `filtered_<output>` - the counts file filtered to only regions containing non-SNP variants
 
 And reports:
 ```
 		v0.1		v0.3-dev
 statistic       count   percent	count   percent
 total regions   2232565 1	2170271 1
 no variant      448124  0.2007	431781  0.1990
-only a SNP      372144  0.1667	112869  0.0520
-only SNPs       474209  0.2124	163636  0.0754
-remaining       938088  0.4202	1461985 0.6736
+only a SNP      372144  0.1667	242294  0.1116
+only SNPs       474209  0.2124	135780  0.0626
+remaining       938088  0.4202	1360416 0.6268
 ```
 
 Let's repeat this with the annotations we made previously
 ```
 		v0.1		v0.3-dev
-statistic       count   percent
-total regions   3298925 1
-no variant      1600118 0.4850
-only a SNP      505514  0.1532
-only SNPs       389598  0.1181
-remaining       803695  0.2436
+statistic       count   percent	count   percent
+total regions   3298925 1	3503876 1
+no variant      1600118 0.4850	1716435 0.4899
+only a SNP      505514  0.1532	332505  0.0949
+only SNPs       389598  0.1181	160201  0.0457
+remaining       803695  0.2436	1294735 0.3695
 ```
 
 And again with the unannotated regions
 ```
 		v0.1		v0.3-dev
-statistic       count   percent
-total regions   439538  1
-no variant      128123  0.2915
-only a SNP      102119  0.2323
-only SNPs       126488  0.2878
-remaining       82808   0.1884
+statistic       count   percent	count   percent
+total regions   439538  1	428642  1
+no variant      128123  0.2915	126221  0.2945
+only a SNP      102119  0.2323	61672   0.1439
+only SNPs       126488  0.2878	28007   0.0653
+remaining       82808   0.1884	212742  0.4963
 ```
 
 So it's interesting (promising) that our unannotated regions less frequently contain variants.
 
+v0.3-dev ... We have a lot more regions 'remaining' in the unannotated. I gotta figure out what's happening here.
+
+1. Adding these new regions (namely pbsv, trgt, and usc) is expanding the boundaries.
+Collect these stats for the first slide... Actually hold off at this point.
+
 Question 2:
 ===========
 Of the candidate regions with variation, what percent of the variants by count and bases effected are contained
@@ -99,17 +104,17 @@ Question 3
 Can we find expansions/contractions of the tr_annotations inside the variants?
 
 The `filtered_variants_to_regions.txt` is now our new version of the tr_regions.bed. We'll use that to repeat the
-'Defining Repeats' steps described in `../README.md`
-
-
-```bash
-samtools faidx -r <(zcat tr_regions.bed.gz | awk '{print $1 ":" $2 "-" $3}')
-~/scratch/insertion_ref/msru/data/reference/grch38/GRCh38_1kg_mainchrs.fa > tr_regions.fasta
-```
-
+'Defining Repeats' steps described in `../README.md` 
 Then run TRF on the reference sequence of regions:
+
 ```bash
-trf409.linux64 data/tr_regions.fasta 3 7 7 80 5 5 500 -h -ngs > data/grch38.tandemrepeatfinder.txt
+samtools faidx -r <(cat filtered_variants_to_regions.txt | awk '{print $1 ":" $2 "-" $3}') \
+    ~/scratch/insertion_ref/msru/data/reference/grch38/GRCh38_1kg_mainchrs.fa > tr_regions.fasta
+trf409.linux64 tr_regions.fasta 3 7 7 80 5 5 500 -h -ngs > grch38.tandemrepeatfinder.txt
+python ../scripts/trf_reformatter.py grch38.tandemrepeatfinder.txt final_something
+bedtools sort -i final_something.bed | bgzip > final_something.bed.gz
+tabix final_something.bed.gz
+python ../scripts/tr_reganno_maker.py filtered_variants_to_regions.txt final_something.bed.gz > candidate_v0.3_anno.bed
 ```
 
 Because we're going to be using the variants to filter these repeat annotations, we lower the min-score to 5 from 40
diff --git a/regions/intersection/non_overlapping_annos_per_region.py b/regions/intersection/non_overlapping_annos_per_region.py
@@ -0,0 +1,21 @@
+import sys
+from truvari.annotations.trf import iter_tr_regions
+from intervaltree import IntervalTree
+
+def tree_and_merge(line):
+    # Store the interval count before ('regcnt') and after ('mrgcnt') merging overlaps on `line`, then return the same `line` object.
+    m_tree = IntervalTree()
+    for i in line['annos']:  # each annotation is indexed by its 'start' and 'end' keys
+        m_tree.addi(i['start'], i['end'])  # NOTE(review): IntervalTree appears to be set-like, so identical (start, end) pairs may be stored once -- confirm
+    line['regcnt'] = len(m_tree)  # tree size before merging
+    m_tree.merge_overlaps()  # NOTE(review): presumably fuses overlapping intervals in place -- confirm against the intervaltree docs
+    line['mrgcnt'] = len(m_tree)  # tree size after merging
+    return line
+
+parts = []  # collects every region entry after tree_and_merge has added its counts
+for entry in iter_tr_regions(sys.argv[1]):  # sys.argv[1] is passed straight to truvari's iter_tr_regions
+    entry = tree_and_merge(entry)
+    parts.append(entry)
+    # NOTE(review): a dangling debug fragment (entry['mrgcnt']) used to sit here; nothing is printed per entry
+import joblib  # NOTE(review): late import; the file's other imports are at the top
+joblib.dump(parts, 'tr_regmrgcnts.jl')  # writes the whole list to a fixed, relative file name
diff --git a/regions/intersection/variant_region_intersection.py b/regions/intersection/variant_region_intersection.py
@@ -33,17 +33,21 @@ def main(in_bed, in_vcf, out_name):
             start = int(start)
             end = int(end)
             cnt = 0
-            bases = 0
+            snps = 0
+            non_snp = 0
             for i in variants.fetch(chrom, int(start), int(end)):
                 # check only svs.. take this out for core analysis but keep in for extra analysis
                 #if 'SVLEN' not in i.info or i.info["SVLEN"] < 50:
                     #continue
                 vs, ve = truvari.entry_boundaries(i)
                 if start <= vs and ve <= end:
                     cnt += 1
-                    bases += truvari.entry_size(i)
-            fout.write(f"{line}\t{cnt}\t{bases}\n")
-    data = pd.read_csv(f"counts_{out_name}", sep='\t', header=None, names=['chrom', 'start', 'end', 'num_vars', 'num_bases'])
+                    if truvari.entry_variant_type(i) != truvari.SV.SNP:
+                        non_snp += 1
+                    else:
+                        snps += 1
+            fout.write(f"{line}\t{cnt}\t{snps}\t{non_snp}\n")
+    data = pd.read_csv(f"counts_{out_name}", sep='\t', header=None, names=['chrom', 'start', 'end', 'num_vars', 'snps', 'non_snp'])
     print("statistic\tcount\tpercent")
 
     tot = len(data)
@@ -53,11 +57,11 @@ def main(in_bed, in_vcf, out_name):
     i = no_var.sum()
     print("no variant\t%d\t%.4f" % (i, i / tot))
 
-    single_snp = (data['num_vars'] == 1) & (data['num_bases'] == 1)
+    single_snp = (data['snps'] == 1) & (data['non_snp'] == 0)
     i = single_snp.sum()
     print("only a SNP\t%d\t%.4f" % (i, i / tot))
 
-    only_snps = (data['num_vars'] > 1) & (data['num_vars'] == data['num_bases'])
+    only_snps = (data['snps'] > 1) & (data['non_snp'] == 0)
     i = only_snps.sum()
     print("only SNPs\t%d\t%.4f" % (i, i / tot))
 
diff --git a/regions/scripts/intersection_counter.sh b/regions/scripts/intersection_counter.sh
@@ -0,0 +1,23 @@
+echo v0.1  # section label: counts against the v0.1 regions file
+echo pbsv  # label for the count printed by the next command
+bedtools intersect -u -a data/pbsv/merged.bed.gz -b delme_v0.1/data/tr_regions.bed.gz | wc -l  # -u emits each -a entry once if it overlaps anything in -b; wc -l counts those entries
+echo usc
+bedtools intersect -u -a data/usc/merged.bed.gz -b delme_v0.1/data/tr_regions.bed.gz | wc -l
+echo trgt
+bedtools intersect -u -a data/trgt/merged.bed.gz -b delme_v0.1/data/tr_regions.bed.gz | wc -l
+
+echo v0.2  # section label: same three sources against the v0.2 annotations
+echo pbsv
+bedtools intersect -u -a data/pbsv/merged.bed.gz -b adotto_TRannotations_v0.2.bed.gz | wc -l
+echo usc
+bedtools intersect -u -a data/usc/merged.bed.gz -b adotto_TRannotations_v0.2.bed.gz | wc -l
+echo trgt
+bedtools intersect -u -a data/trgt/merged.bed.gz -b adotto_TRannotations_v0.2.bed.gz | wc -l
+
+echo v0.3  # section label: same three sources against the v0.3 annotations
+echo pbsv
+bedtools intersect -u -a data/pbsv/merged.bed.gz -b adotto_TRannotations_v0.3.bed.gz | wc -l
+echo usc
+bedtools intersect -u -a data/usc/merged.bed.gz -b adotto_TRannotations_v0.3.bed.gz | wc -l
+echo trgt
+bedtools intersect -u -a data/trgt/merged.bed.gz -b adotto_TRannotations_v0.3.bed.gz | wc -l  # NOTE(review): all paths are relative; presumably run from the regions/ directory -- confirm
diff --git a/variants/.gitignore b/variants/.gitignore
@@ -1 +1,15 @@
 data/adotto_variants.grch38.sqoff.vcf.gz
+benchmarking/cmrg_comb/
+benchmarking/cmrg_region_phab/
+benchmarking/cmrg_svsmall/
+benchmarking/delme.txt
+benchmarking/no_bed/
+benchmarking/phab.raw.jl
+benchmarking/redos.bed
+benchmarking/redos/
+benchmarking/test/
+benchmarking/trash/
+benchmarking/x.sh
+data/adotto_variants.grch38.sqoff.vcf.gz.tbi
+data/delme/
+x.sh