diff --git a/STUDY_CONFIGS/chdm_phs001643_2018_data_processing_config.json b/STUDY_CONFIGS/chdm_phs001643_2018_data_processing_config.json index ab6b10f..0a65282 100644 --- a/STUDY_CONFIGS/chdm_phs001643_2018_data_processing_config.json +++ b/STUDY_CONFIGS/chdm_phs001643_2018_data_processing_config.json @@ -2,8 +2,8 @@ "bedtools": "bedtools", "cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl", "bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed", - "hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_EntrezID.tsv", - "entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol.tsv", + "hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv", + "entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt", "rna_ext_list": { "expression": "rsem.genes.results.gz", "fusion": "annoFuse_filter.tsv" diff --git a/STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json b/STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json index 4417f7e..9d86bce 100644 --- a/STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json +++ b/STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json @@ -2,8 +2,8 @@ "bedtools": "bedtools", "cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl", "bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed", - "hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_EntrezID.tsv", - "entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol.tsv", + "hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv", + "entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt", "rna_ext_list": { "expression": "rsem.genes.results.gz", "fusion": "annoFuse_filter.tsv" diff --git a/STUDY_CONFIGS/open_chordoma_data_processing_config.json b/STUDY_CONFIGS/open_chordoma_data_processing_config.json index 1cf4b12..38d57de 100644 --- a/STUDY_CONFIGS/open_chordoma_data_processing_config.json +++ b/STUDY_CONFIGS/open_chordoma_data_processing_config.json @@ -2,8 +2,8 @@ "bedtools": "bedtools", "cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl", "bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed", - "hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_EntrezID.tsv", - "entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol.tsv", + "hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv", + "entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt", "rna_ext_list": { "expression": "rsem.genes.results.gz", "fusion": "annoFuse_filter.tsv" diff --git a/scripts/cnv_1_genome2gene.py b/scripts/cnv_1_genome2gene.py index 6945c95..d1f16c8 100755 --- a/scripts/cnv_1_genome2gene.py +++ b/scripts/cnv_1_genome2gene.py @@ -14,61 +14,41 @@ def process_cnv(cpath): cnv_cp_only = config_data["cp_only_script"] bed_file = config_data["bed_genes"] hugo_tsv = config_data["hugo_tsv"] - sys.stderr.write("Processing " + cpath + "\n") + print("Processing " + cpath, file=sys.stderr) root = os.path.basename(cpath) - limit = config_data["cnv_min_len"] - temp_len = root + ".CNVs_" + str(limit) + "_filtered.bed" - len_fh = open(out_dir + temp_len, "w") - in_cnv = open(cpath) - skip = next(in_cnv) - for cnv in in_cnv: - ct_dict["total_cnvs"] += 1 - cnv_data = cnv.rstrip("\n").split("\t") - if int(cnv_data[2]) - int(cnv_data[1]) >= limit: - len_fh.write("\t".join(cnv_data[0:5]) + "\n") - else: - ct_dict["short_cnvs"] += 1 - # sys.stderr.write("Entry too short " + cnv) - len_fh.close() - temp = root + ".CNVs.Genes" - to_genes_cmd = ( - bedtools - + " intersect -a " - + out_dir - + temp_len - + " -b " - + bed_file - + " -wb > " - + out_dir - + temp - ) + temp_filtered_fname = root + ".CNVs_0.05_filtered.bed" + with open(out_dir + temp_filtered_fname, "w") as temp_filtered: + with open(cpath) as in_cnv: + head = next(in_cnv) + header = head.rstrip('\n').split('\t') + wilcox_idx = header.index('WilcoxonRankSumTestPvalue') + ks_idx = header.index('KolmogorovSmirnovPvalue') + for cnv in in_cnv: + ct_dict["total_cnvs"] += 1 + cnv_data = cnv.rstrip("\n").split("\t") + if cnv_data[wilcox_idx] == "NA" or cnv_data[ks_idx] == "NA": + ct_dict["NA"] += 1 + elif float(cnv_data[wilcox_idx]) < 0.05 and float(cnv_data[ks_idx]) < 0.05: + print("\t".join(cnv_data[0:5]), file=temp_filtered) + else: + ct_dict["p_value_filter"] += 1 + temp_genes_fname = root + ".CNVs.Genes" + to_genes_cmd = "{} intersect -a {}{} -b {} -wb > {}{}".format(bedtools, out_dir, temp_filtered_fname, bed_file, out_dir, temp_genes_fname) subprocess.call(to_genes_cmd, shell=True) - out_gene_cnv_only = ( - "perl " - + cnv_cp_only - + " " - + hugo_tsv - + " " - + out_dir - + temp - + " > " - + out_dir - + temp - + ".copy_number" - ) + out_gene_cnv_only = "perl {} {} {}{} > {}{}.copy_number".format(cnv_cp_only, hugo_tsv, out_dir, temp_genes_fname, out_dir, temp_genes_fname) check = subprocess.call(out_gene_cnv_only, shell=True) if check: - sys.stderr.write('Error using to geens script. Check log\n') + print('Error using to genes script. Check log', file=sys.stderr) sys.exit(1) - rm_tmp = "rm " + out_dir + temp + " " + out_dir + temp_len + rm_tmp = "rm {}{} {}{}".format(out_dir, temp_genes_fname, out_dir, temp_filtered_fname) subprocess.call(rm_tmp, shell=True) except Exception as e: - sys.stderr.write(str(e)) + print("{}".format(e), file=sys.stderr) sys.exit(1) parser = argparse.ArgumentParser( - description="Convert controlFreeC cnv genome coords to gene level" + description="Convert ControlFreeC cnv genome coords to gene level and filter based on p value < 0.05" ) parser.add_argument( "-d", @@ -89,27 +69,17 @@ def process_cnv(cpath): with open(args.config_file) as f: config_data = json.load(f) -limit = config_data["cnv_min_len"] suffix = config_data["dna_ext_list"]["copy_number"] -sys.stderr.write("Getting cnv list\n") +print("Getting cnv list", file=sys.stderr) flist =[os.path.join(args.cnv_dir, f) for f in os.listdir(args.cnv_dir) if os.path.isfile(os.path.join(args.cnv_dir, f))] out_dir = "converted_cnvs/" try: os.mkdir(out_dir) except: - sys.stderr.write("output dir already exists\n") + print("output dir already exists", file=sys.stderr) -ct_dict = {"total_cnvs": 0, "short_cnvs": 0} +ct_dict = {"total_cnvs": 0, "p_value_filter": 0, "NA": 0} with concurrent.futures.ThreadPoolExecutor(config_data["cpus"]) as executor: - results = { - executor.submit(process_cnv, cpath): cpath - for cpath in flist - } -sys.stderr.write(str(ct_dict["total_cnvs"]) + " total cnv calls processed\n") -sys.stderr.write( - str(ct_dict["short_cnvs"]) - + " calls dropped for not meeting min bp size " - + str(limit) - + "\n" -) -sys.stderr.write("Done, check logs\n") + results = { executor.submit(process_cnv, cpath): cpath for cpath in flist } +print("{} total cnv calls processed".format(ct_dict["total_cnvs"]), file=sys.stderr) +print("{} calls dropped for not meeting p value cutoff of 0.05, {} calls dropped for having an NA p value".format(ct_dict["p_value_filter"], ct_dict["NA"]), file=sys.stderr ) diff --git a/scripts/get_cbio_copy_only_num.pl b/scripts/get_cbio_copy_only_num.pl index 679bc96..3fb3d2e 100755 --- a/scripts/get_cbio_copy_only_num.pl +++ b/scripts/get_cbio_copy_only_num.pl @@ -1,4 +1,6 @@ -#! usr/bin/env perl +#!/usr/bin/env perl +# Script to output a headerless TSV with hugo gene symbol entrez ID copy number for downstrema processing +# Positional args, file 1 is a Hugo entrez ID TSV, file 2 is output from cnv_1_genome2gene.py use strict; use warnings FATAL => 'all'; use Try::Tiny; @@ -8,6 +10,7 @@ my %hash; my %cbio; +# First file is hugo tsv, with gene symbols and entrez IDs open IN, $in; while(){ chomp; @@ -20,15 +23,18 @@ $hash{$a[0]} = 0; } } +# Second file is output of bedtools intersect of ControlFreeC p value file filtered by cnv_1_genome2gene.py and a bed file with coords and gene names open IN2, $in2; while(){ try{ chomp; my @b=split(/\t/,$_); + # CN value my $num=$b[3]; + # ensure gene name is in hugo file if (exists $hash{$b[-1]}){ + # hashes by gene name entrez id my $as="$b[-1]\t$hash{$b[-1]}"; - # print "$b[-1]\t$hash{$b[-1]}\t$num\n"; $cbio{$as}=$num; } } diff --git a/utilities/compare_cn_files.py b/utilities/compare_cn_files.py new file mode 100644 index 0000000..09e2ba9 --- /dev/null +++ b/utilities/compare_cn_files.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +""" +Likely a one-off script. Compare CN matrices from a previous load to a new one, summarize difference, and output number of hotspot changes +Usage: + python3 compare_cn_files.py old.tsv new.tsv hotspots.txt +""" + +import pandas as pd +import sys +import numpy as np + +old = pd.read_csv(sys.argv[1], sep="\t") +old.set_index("Hugo_Symbol", inplace=True) +old.sort_index(inplace=True) + +new = pd.read_csv(sys.argv[2], sep="\t") +new.set_index("Hugo_Symbol", inplace=True) +new.sort_index(inplace=True) + +ne_stacked = (old != new).stack() +changed = ne_stacked[ne_stacked] + +difference_locations = np.where(old != new) + +changed_from = old.values[difference_locations] + +changed_to = new.values[difference_locations] +result = pd.DataFrame({'from': changed_from, 'to': changed_to}, index=changed.index) + +with open(sys.argv[3]) as f: + hotspots = f.read().splitlines() + +print("HotSpot\tCount Added\tCount Dropped") +new_added = result[result['from'] == 0] +print("{} entries will be added".format(len(new_added)), file=sys.stderr) +new_rm = result[result['to'] == 0] +print("{} entries will be dropped".format(len(new_rm)), file=sys.stderr) +clean_list = [] +for gene in hotspots: + add = 0 + drop = 0 + try: + add = len(new_added.loc[[gene]]) + except Exception as e: + print(e, file=sys.stderr) + try: + drop = len(new_rm.loc[[gene]]) + except Exception as e: + print(e, file=sys.stderr) + if add or drop: + print("{}\t{}\t{}".format(gene, add, drop)) + clean_list.append(gene)