Merge pull request #67 from kids-first/feature/mb-adjust-cn-cutoff
adjust cn cutoff
migbro authored Sep 26, 2024
2 parents 6523cb3 + 238aec1 commit 37568b6
Showing 6 changed files with 96 additions and 68 deletions.
4 changes: 2 additions & 2 deletions STUDY_CONFIGS/chdm_phs001643_2018_data_processing_config.json
@@ -2,8 +2,8 @@
"bedtools": "bedtools",
"cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl",
"bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed",
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_EntrezID.tsv",
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol.tsv",
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv",
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt",
"rna_ext_list": {
"expression": "rsem.genes.results.gz",
"fusion": "annoFuse_filter.tsv"
4 changes: 2 additions & 2 deletions STUDY_CONFIGS/chdm_sd_7spqtt8m_data_processing_config.json
@@ -2,8 +2,8 @@
"bedtools": "bedtools",
"cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl",
"bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed",
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_EntrezID.tsv",
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol.tsv",
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv",
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt",
"rna_ext_list": {
"expression": "rsem.genes.results.gz",
"fusion": "annoFuse_filter.tsv"
4 changes: 2 additions & 2 deletions STUDY_CONFIGS/open_chordoma_data_processing_config.json
@@ -2,8 +2,8 @@
"bedtools": "bedtools",
"cp_only_script": "/home/ubuntu/tools/kf-cbioportal-etl/scripts/get_cbio_copy_only_num.pl",
"bed_genes": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/Homo_sapiens.GRCh38.105.chr.gtf_genes.bed",
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_EntrezID.tsv",
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol.tsv",
"hugo_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/HUGO_2021-06-01_EntrezID.tsv",
"entrez_tsv": "/home/ubuntu/tools/kf-cbioportal-etl/REFS/EntrezGeneId_HugoGeneSymbol_2021-06-01.txt",
"rna_ext_list": {
"expression": "rsem.genes.results.gz",
"fusion": "annoFuse_filter.tsv"
90 changes: 30 additions & 60 deletions scripts/cnv_1_genome2gene.py
@@ -14,61 +14,41 @@ def process_cnv(cpath):
cnv_cp_only = config_data["cp_only_script"]
bed_file = config_data["bed_genes"]
hugo_tsv = config_data["hugo_tsv"]
sys.stderr.write("Processing " + cpath + "\n")
print("Processing " + cpath, file=sys.stderr)
root = os.path.basename(cpath)
limit = config_data["cnv_min_len"]
temp_len = root + ".CNVs_" + str(limit) + "_filtered.bed"
len_fh = open(out_dir + temp_len, "w")
in_cnv = open(cpath)
skip = next(in_cnv)
for cnv in in_cnv:
ct_dict["total_cnvs"] += 1
cnv_data = cnv.rstrip("\n").split("\t")
if int(cnv_data[2]) - int(cnv_data[1]) >= limit:
len_fh.write("\t".join(cnv_data[0:5]) + "\n")
else:
ct_dict["short_cnvs"] += 1
# sys.stderr.write("Entry too short " + cnv)
len_fh.close()
temp = root + ".CNVs.Genes"
to_genes_cmd = (
bedtools
+ " intersect -a "
+ out_dir
+ temp_len
+ " -b "
+ bed_file
+ " -wb > "
+ out_dir
+ temp
)
temp_filtered_fname = root + ".CNVs_0.05_filtered.bed"
with open(out_dir + temp_filtered_fname, "w") as temp_filtered:
with open(cpath) as in_cnv:
head = next(in_cnv)
header = head.rstrip('\n').split('\t')
wilcox_idx = header.index('WilcoxonRankSumTestPvalue')
ks_idx = header.index('KolmogorovSmirnovPvalue')
for cnv in in_cnv:
ct_dict["total_cnvs"] += 1
cnv_data = cnv.rstrip("\n").split("\t")
if cnv_data[wilcox_idx] == "NA" or cnv_data[ks_idx] == "NA":
ct_dict["NA"] += 1
elif float(cnv_data[wilcox_idx]) < 0.05 and float(cnv_data[ks_idx]) < 0.05:
print("\t".join(cnv_data[0:5]), file=temp_filtered)
else:
ct_dict["p_value_filter"] += 1
temp_genes_fname = root + ".CNVs.Genes"
to_genes_cmd = "{} intersect -a {}{} -b {} -wb > {}{}".format(bedtools, out_dir, temp_filtered_fname, bed_file, out_dir, temp_genes_fname)
subprocess.call(to_genes_cmd, shell=True)
out_gene_cnv_only = (
"perl "
+ cnv_cp_only
+ " "
+ hugo_tsv
+ " "
+ out_dir
+ temp
+ " > "
+ out_dir
+ temp
+ ".copy_number"
)
out_gene_cnv_only = "perl {} {} {}{} > {}{}.copy_number".format(cnv_cp_only, hugo_tsv, out_dir, temp_genes_fname, out_dir, temp_genes_fname)
check = subprocess.call(out_gene_cnv_only, shell=True)
if check:
sys.stderr.write('Error using to geens script. Check log\n')
print('Error using to genes script. Check log', file=sys.stderr)
sys.exit(1)
rm_tmp = "rm " + out_dir + temp + " " + out_dir + temp_len
rm_tmp = "rm {}{} {}{}".format(out_dir, temp_genes_fname, out_dir, temp_filtered_fname)
subprocess.call(rm_tmp, shell=True)
except Exception as e:
sys.stderr.write(str(e))
print("{}".format(e), file=sys.stderr)
sys.exit(1)


parser = argparse.ArgumentParser(
description="Convert controlFreeC cnv genome coords to gene level"
description="Convert ControlFreeC cnv genome coords to gene level and filter based on p value < 0.05"
)
parser.add_argument(
"-d",
@@ -89,27 +69,17 @@ def process_cnv(cpath):
with open(args.config_file) as f:
config_data = json.load(f)

limit = config_data["cnv_min_len"]
suffix = config_data["dna_ext_list"]["copy_number"]
sys.stderr.write("Getting cnv list\n")
print("Getting cnv list", file=sys.stderr)
flist =[os.path.join(args.cnv_dir, f) for f in os.listdir(args.cnv_dir) if os.path.isfile(os.path.join(args.cnv_dir, f))]
out_dir = "converted_cnvs/"
try:
os.mkdir(out_dir)
except:
sys.stderr.write("output dir already exists\n")
print("output dir already exists", file=sys.stderr)

ct_dict = {"total_cnvs": 0, "short_cnvs": 0}
ct_dict = {"total_cnvs": 0, "p_value_filter": 0, "NA": 0}
with concurrent.futures.ThreadPoolExecutor(config_data["cpus"]) as executor:
results = {
executor.submit(process_cnv, cpath): cpath
for cpath in flist
}
sys.stderr.write(str(ct_dict["total_cnvs"]) + " total cnv calls processed\n")
sys.stderr.write(
str(ct_dict["short_cnvs"])
+ " calls dropped for not meeting min bp size "
+ str(limit)
+ "\n"
)
sys.stderr.write("Done, check logs\n")
results = { executor.submit(process_cnv, cpath): cpath for cpath in flist }
print("{} total cnv calls processed".format(ct_dict["total_cnvs"]), file=sys.stderr)
print("{} calls dropped for not meeting p value cutoff of 0.05, {} calls dropped for having an NA p value".format(ct_dict["p_value_filter"], ct_dict["NA"]), file=sys.stderr )
10 changes: 8 additions & 2 deletions scripts/get_cbio_copy_only_num.pl
@@ -1,4 +1,6 @@
#! usr/bin/env perl
#!/usr/bin/env perl
# Script to output a headerless TSV with hugo gene symbol <tab> entrez ID <tab> copy number for downstream processing
# Positional args, file 1 is a Hugo <tab> entrez ID TSV, file 2 is output from cnv_1_genome2gene.py
use strict;
use warnings FATAL => 'all';
use Try::Tiny;
@@ -8,6 +10,7 @@
my %hash;
my %cbio;

# First file is hugo tsv, with gene symbols and entrez IDs
open IN, $in;
while(<IN>){
chomp;
@@ -20,15 +23,18 @@
$hash{$a[0]} = 0;
}
}
# Second file is output of bedtools intersect of ControlFreeC p value file filtered by cnv_1_genome2gene.py and a bed file with coords and gene names
open IN2, $in2;
while(<IN2>){
try{
chomp;
my @b=split(/\t/,$_);
# CN value
my $num=$b[3];
# ensure gene name is in hugo file
if (exists $hash{$b[-1]}){
# hashes by gene name <tab> entrez id
my $as="$b[-1]\t$hash{$b[-1]}";
# print "$b[-1]\t$hash{$b[-1]}\t$num\n";
$cbio{$as}=$num;
}
}
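For context, cnv_1_genome2gene.py invokes this helper as: perl get_cbio_copy_only_num.pl <hugo_tsv> <sample>.CNVs.Genes > <sample>.CNVs.Genes.copy_number. The sketch below is a rough Python rendering of the data flow the new comments describe, for illustration only; the exact file layouts are assumed from those comments rather than from the full Perl source, which is not shown in this diff.

# Illustrative Python equivalent of the Perl helper's data flow; not part of the ETL.
# File 1: Hugo symbol <tab> Entrez ID. File 2: bedtools intersect output whose 4th
# column is the copy number call and whose last column is the gene symbol.
import sys

hugo_tsv, intersect_file = sys.argv[1], sys.argv[2]

entrez_by_symbol = {}
with open(hugo_tsv) as hugo:
    for line in hugo:
        fields = line.rstrip("\n").split("\t")
        # default the Entrez ID to 0 when it is missing, mirroring the Perl hash
        entrez_by_symbol[fields[0]] = fields[1] if len(fields) > 1 and fields[1] else "0"

copy_number_by_gene = {}
with open(intersect_file) as hits:
    for line in hits:
        fields = line.rstrip("\n").split("\t")
        gene = fields[-1]
        if gene in entrez_by_symbol:
            # later segments overwrite earlier ones for the same gene, as in the Perl %cbio hash
            copy_number_by_gene[(gene, entrez_by_symbol[gene])] = fields[3]

for (gene, entrez), copy_number in copy_number_by_gene.items():
    print(gene + "\t" + entrez + "\t" + copy_number)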
52 changes: 52 additions & 0 deletions utilities/compare_cn_files.py
@@ -0,0 +1,52 @@
#!/usr/bin/env python3
"""
Likely a one-off script. Compare CN matrices from a previous load to a new one, summarize differences, and output the number of hotspot changes
Usage:
python3 compare_cn_files.py old.tsv new.tsv hotspots.txt
"""

import pandas as pd
import sys
import numpy as np

old = pd.read_csv(sys.argv[1], sep="\t")
old.set_index("Hugo_Symbol", inplace=True)
old.sort_index(inplace=True)

new = pd.read_csv(sys.argv[2], sep="\t")
new.set_index("Hugo_Symbol", inplace=True)
new.sort_index(inplace=True)

ne_stacked = (old != new).stack()
changed = ne_stacked[ne_stacked]

difference_locations = np.where(old != new)

changed_from = old.values[difference_locations]

changed_to = new.values[difference_locations]
result = pd.DataFrame({'from': changed_from, 'to': changed_to}, index=changed.index)

with open(sys.argv[3]) as f:
hotspots = f.read().splitlines()

print("HotSpot\tCount Added\tCount Dropped")
new_added = result[result['from'] == 0]
print("{} entries will be added".format(len(new_added)), file=sys.stderr)
new_rm = result[result['to'] == 0]
print("{} entries will be dropped".format(len(new_rm)), file=sys.stderr)
clean_list = []
for gene in hotspots:
add = 0
drop = 0
try:
add = len(new_added.loc[[gene]])
except Exception as e:
print(e, file=sys.stderr)
try:
drop = len(new_rm.loc[[gene]])
except Exception as e:
print(e, file=sys.stderr)
if add or drop:
print("{}\t{}\t{}".format(gene, add, drop))
clean_list.append(gene)
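
A hypothetical toy run of the comparison logic (gene names, sample IDs, and values invented for illustration), showing how a changed cell is attributed and why both matrices must share the same genes and sample columns for the element-wise old != new comparison to work:

# Toy matrices for illustration only; not real study data.
import pandas as pd

genes = pd.Index(["BRAF", "TP53"], name="Hugo_Symbol")
old = pd.DataFrame({"SAMPLE_1": [0, 2], "SAMPLE_2": [1, 0]}, index=genes)
new = pd.DataFrame({"SAMPLE_1": [2, 2], "SAMPLE_2": [1, 0]}, index=genes)

ne_stacked = (old != new).stack()
changed = ne_stacked[ne_stacked]       # one changed cell: BRAF in SAMPLE_1
print(changed.index.tolist())          # [('BRAF', 'SAMPLE_1')]
# The old value was 0 and the new value is non-zero, so the script would count
# this cell as an entry added for the BRAF hotspot (the 'from' == 0 branch).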
