Merge pull request #67 from kids-first/feature/mb-adjust-cn-cutoff
adjust cn cutoff
migbro authored Sep 26, 2024
2 parents 6523cb3 + 238aec1 commit 37568b6
Showing 6 changed files with 96 additions and 68 deletions.
4 changes: 2 additions & 2 deletions STUDY_CONFIGS/chdm_phs001643_2018_data_processing_config.json
@@ -2,8 +2,8 @@
"bedtools": "bedtools",
"cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl",
"bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed",
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_EntrezID.tsv",
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol.tsv",
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv",
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt",
"rna_ext_list": {
"expression": "rsem.genes.results.gz",
"fusion": "annoFuse_filter.tsv"
4 changes: 2 additions & 2 deletions STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json
@@ -2,8 +2,8 @@
"bedtools": "bedtools",
"cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl",
"bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed",
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_EntrezID.tsv",
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol.tsv",
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv",
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt",
"rna_ext_list": {
"expression": "rsem.genes.results.gz",
"fusion": "annoFuse_filter.tsv"
4 changes: 2 additions & 2 deletions STUDY_CONFIGS/open_chordoma_data_processing_config.json
@@ -2,8 +2,8 @@
"bedtools": "bedtools",
"cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl",
"bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed",
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_EntrezID.tsv",
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol.tsv",
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv",
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt",
"rna_ext_list": {
"expression": "rsem.genes.results.gz",
"fusion": "annoFuse_filter.tsv"
90 changes: 30 additions & 60 deletions scripts/cnv_1_genome2gene.py
@@ -14,61 +14,41 @@ def process_cnv(cpath):
cnv_cp_only = config_data["cp_only_script"]
bed_file = config_data["bed_genes"]
hugo_tsv = config_data["hugo_tsv"]
sys.stderr.write("Processing " + cpath + "\n")
print("Processing " + cpath, file=sys.stderr)
root = os.path.basename(cpath)
limit = config_data["cnv_min_len"]
temp_len = root + ".CNVs_" + str(limit) + "_filtered.bed"
len_fh = open(out_dir + temp_len, "w")
in_cnv = open(cpath)
skip = next(in_cnv)
for cnv in in_cnv:
ct_dict["total_cnvs"] += 1
cnv_data = cnv.rstrip("\n").split("\t")
if int(cnv_data[2]) - int(cnv_data[1]) >= limit:
len_fh.write("\t".join(cnv_data[0:5]) + "\n")
else:
ct_dict["short_cnvs"] += 1
# sys.stderr.write("Entry too short " + cnv)
len_fh.close()
temp = root + ".CNVs.Genes"
to_genes_cmd = (
bedtools
+ " intersect -a "
+ out_dir
+ temp_len
+ " -b "
+ bed_file
+ " -wb > "
+ out_dir
+ temp
)
temp_filtered_fname = root + ".CNVs_0.05_filtered.bed"
with open(out_dir + temp_filtered_fname, "w") as temp_filtered:
with open(cpath) as in_cnv:
head = next(in_cnv)
header = head.rstrip('\n').split('\t')
wilcox_idx = header.index('WilcoxonRankSumTestPvalue')
ks_idx = header.index('KolmogorovSmirnovPvalue')
for cnv in in_cnv:
ct_dict["total_cnvs"] += 1
cnv_data = cnv.rstrip("\n").split("\t")
if cnv_data[wilcox_idx] == "NA" or cnv_data[ks_idx] == "NA":
ct_dict["NA"] += 1
elif float(cnv_data[wilcox_idx]) < 0.05 and float(cnv_data[ks_idx]) < 0.05:
print("\t".join(cnv_data[0:5]), file=temp_filtered)
else:
ct_dict["p_value_filter"] += 1
temp_genes_fname = root + ".CNVs.Genes"
to_genes_cmd = "{} intersect -a {}{} -b {} -wb > {}{}".format(bedtools, out_dir, temp_filtered_fname, bed_file, out_dir, temp_genes_fname)
subprocess.call(to_genes_cmd, shell=True)
out_gene_cnv_only = (
"perl "
+ cnv_cp_only
+ " "
+ hugo_tsv
+ " "
+ out_dir
+ temp
+ " > "
+ out_dir
+ temp
+ ".copy_number"
)
out_gene_cnv_only = "perl {} {} {}{} > {}{}.copy_number".format(cnv_cp_only, hugo_tsv, out_dir, temp_genes_fname, out_dir, temp_genes_fname)
check = subprocess.call(out_gene_cnv_only, shell=True)
if check:
sys.stderr.write('Error using to geens script. Check log\n')
print('Error using to genes script. Check log', file=sys.stderr)
sys.exit(1)
rm_tmp = "rm " + out_dir + temp + " " + out_dir + temp_len
rm_tmp = "rm {}{} {}{}".format(out_dir, temp_genes_fname, out_dir, temp_filtered_fname)
subprocess.call(rm_tmp, shell=True)
except Exception as e:
sys.stderr.write(str(e))
print("{}".format(e), file=sys.stderr)
sys.exit(1)


parser = argparse.ArgumentParser(
description="Convert controlFreeC cnv genome coords to gene level"
description="Convert ControlFreeC cnv genome coords to gene level and filter based on p value < 0.05"
)
parser.add_argument(
"-d",
@@ -89,27 +69,17 @@ def process_cnv(cpath):
with open(args.config_file) as f:
config_data = json.load(f)

limit = config_data["cnv_min_len"]
suffix = config_data["dna_ext_list"]["copy_number"]
sys.stderr.write("Getting cnv list\n")
print("Getting cnv list", file=sys.stderr)
flist =[os.path.join(args.cnv_dir, f) for f in os.listdir(args.cnv_dir) if os.path.isfile(os.path.join(args.cnv_dir, f))]
out_dir = "converted_cnvs/"
try:
os.mkdir(out_dir)
except:
sys.stderr.write("output dir already exists\n")
print("output dir already exists", file=sys.stderr)

ct_dict = {"total_cnvs": 0, "short_cnvs": 0}
ct_dict = {"total_cnvs": 0, "p_value_filter": 0, "NA": 0}
with concurrent.futures.ThreadPoolExecutor(config_data["cpus"]) as executor:
results = {
executor.submit(process_cnv, cpath): cpath
for cpath in flist
}
sys.stderr.write(str(ct_dict["total_cnvs"]) + " total cnv calls processed\n")
sys.stderr.write(
str(ct_dict["short_cnvs"])
+ " calls dropped for not meeting min bp size "
+ str(limit)
+ "\n"
)
sys.stderr.write("Done, check logs\n")
results = { executor.submit(process_cnv, cpath): cpath for cpath in flist }
print("{} total cnv calls processed".format(ct_dict["total_cnvs"]), file=sys.stderr)
print("{} calls dropped for not meeting p value cutoff of 0.05, {} calls dropped for having an NA p value".format(ct_dict["p_value_filter"], ct_dict["NA"]), file=sys.stderr )
10 changes: 8 additions & 2 deletions scripts/get_cbio_copy_only_num.pl
@@ -1,4 +1,6 @@
#! usr/bin/env perl
#!/usr/bin/env perl
# Script to output a headerless TSV with hugo gene symbol <tab> entrez ID <tab> copy number for downstream processing
# Positional args, file 1 is a Hugo <tab> entrez ID TSV, file 2 is output from cnv_1_genome2gene.py
use strict;
use warnings FATAL => 'all';
use Try::Tiny;
@@ -8,6 +10,7 @@
my %hash;
my %cbio;

# First file is hugo tsv, with gene symbols and entrez IDs
open IN, $in;
while(<IN>){
chomp;
@@ -20,15 +23,18 @@
$hash{$a[0]} = 0;
}
}
# Second file is output of bedtools intersect of ControlFreeC p value file filtered by cnv_1_genome2gene.py and a bed file with coords and gene names
open IN2, $in2;
while(<IN2>){
try{
chomp;
my @b=split(/\t/,$_);
# CN value
my $num=$b[3];
# ensure gene name is in hugo file
if (exists $hash{$b[-1]}){
# hashes by gene name <tab> entrez id
my $as="$b[-1]\t$hash{$b[-1]}";
# print "$b[-1]\t$hash{$b[-1]}\t$num\n";
$cbio{$as}=$num;
}
}
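For context, cnv_1_genome2gene.py invokes this helper as: perl get_cbio_copy_only_num.pl <hugo_tsv> <sample>.CNVs.Genes > <sample>.CNVs.Genes.copy_number. The sketch below is a rough Python rendering of the data flow the new comments describe, for illustration only; the exact file layouts are assumed from those comments rather than from the full Perl source, which is not shown in this diff.

# Illustrative Python equivalent of the Perl helper's data flow; not part of the ETL.
# File 1: Hugo symbol <tab> Entrez ID. File 2: bedtools intersect output whose 4th
# column is the copy number call and whose last column is the gene symbol.
import sys

hugo_tsv, intersect_file = sys.argv[1], sys.argv[2]

entrez_by_symbol = {}
with open(hugo_tsv) as hugo:
    for line in hugo:
        fields = line.rstrip("\n").split("\t")
        # default the Entrez ID to 0 when it is missing, mirroring the Perl hash
        entrez_by_symbol[fields[0]] = fields[1] if len(fields) > 1 and fields[1] else "0"

copy_number_by_gene = {}
with open(intersect_file) as hits:
    for line in hits:
        fields = line.rstrip("\n").split("\t")
        gene = fields[-1]
        if gene in entrez_by_symbol:
            # later segments overwrite earlier ones for the same gene, as in the Perl %cbio hash
            copy_number_by_gene[(gene, entrez_by_symbol[gene])] = fields[3]

for (gene, entrez), copy_number in copy_number_by_gene.items():
    print(gene + "\t" + entrez + "\t" + copy_number)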
52 changes: 52 additions & 0 deletions utilities/compare_cn_files.py
@@ -0,0 +1,52 @@
#!/usr/bin/env python3
"""
Likely a one-off script. Compare CN matrices from a previous load to a new one, summarize differences, and output the number of hotspot changes
Usage:
python3 compare_cn_files.py old.tsv new.tsv hotspots.txt
"""

import pandas as pd
import sys
import numpy as np

old = pd.read_csv(sys.argv[1], sep="\t")
old.set_index("Hugo_Symbol", inplace=True)
old.sort_index(inplace=True)

new = pd.read_csv(sys.argv[2], sep="\t")
new.set_index("Hugo_Symbol", inplace=True)
new.sort_index(inplace=True)

ne_stacked = (old != new).stack()
changed = ne_stacked[ne_stacked]

difference_locations = np.where(old != new)

changed_from = old.values[difference_locations]

changed_to = new.values[difference_locations]
result = pd.DataFrame({'from': changed_from, 'to': changed_to}, index=changed.index)

with open(sys.argv[3]) as f:
hotspots = f.read().splitlines()

print("HotSpot\tCount Added\tCount Dropped")
new_added = result[result['from'] == 0]
print("{} entries will be added".format(len(new_added)), file=sys.stderr)
new_rm = result[result['to'] == 0]
print("{} entries will be dropped".format(len(new_rm)), file=sys.stderr)
clean_list = []
for gene in hotspots:
add = 0
drop = 0
try:
add = len(new_added.loc[[gene]])
except Exception as e:
print(e, file=sys.stderr)
try:
drop = len(new_rm.loc[[gene]])
except Exception as e:
print(e, file=sys.stderr)
if add or drop:
print("{}\t{}\t{}".format(gene, add, drop))
clean_list.append(gene)
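
A hypothetical toy run of the comparison logic (gene names, sample IDs, and values invented for illustration), showing how a changed cell is attributed and why both matrices must share the same genes and sample columns for the element-wise old != new comparison to work:

# Toy matrices for illustration only; not real study data.
import pandas as pd

genes = pd.Index(["BRAF", "TP53"], name="Hugo_Symbol")
old = pd.DataFrame({"SAMPLE_1": [0, 2], "SAMPLE_2": [1, 0]}, index=genes)
new = pd.DataFrame({"SAMPLE_1": [2, 2], "SAMPLE_2": [1, 0]}, index=genes)

ne_stacked = (old != new).stack()
changed = ne_stacked[ne_stacked]       # one changed cell: BRAF in SAMPLE_1
print(changed.index.tolist())          # [('BRAF', 'SAMPLE_1')]
# The old value was 0 and the new value is non-zero, so the script would count
# this cell as an entry added for the BRAF hotspot (the 'from' == 0 branch).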
