Skip to content

Commit

Permalink
Merge pull request opentargets#23 from opentargets/il-other
Browse files Browse the repository at this point in the history
Studies w/ sumstats enhancement
  • Loading branch information
bruno-ariano authored Jul 12, 2022
2 parents 8b9180e + 977dc99 commit 4b5e6f9
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 13 deletions.
2 changes: 1 addition & 1 deletion configs/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ gwas_cat_cluster_min_loci: 10 # Minimum number of reported loci for that study t
gwas_cat_cluster_multi_proportion: 0.3 # For a given study, if more than this proportion of loci are multi-signals (>1 signal within gwas_cat_cluster_dist_kb), the study will be clustered

# Summary statistics finemapping pipeline output files
gcs_sumstat_pattern: 'gs://genetics-portal-dev-sumstats/unfiltered/gwas/*.parquet'
sumstats_gcs_path: 'gs://genetics-portal-dev-sumstats/unfiltered/gwas'
toploci: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/top_loci.json.gz'
credsets: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/credset/_SUCCESS' # Need the whole directory, so passing the _SUCCESS file instead to trick snakemake

Expand Down
14 changes: 4 additions & 10 deletions scripts/study_table_to_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,10 @@ def main():
#

# Load list of study IDs that have sumstats
from_sumstats = set([])
with open(args.sumstat_studies, 'r') as in_h:
for line in in_h:
# Get study_id
line = line.rstrip().rstrip('/')
stid = os.path.basename(line).replace('.parquet', '')
from_sumstats.add(stid)
sumstat_studies = pd.read_csv(args.sumstat_studies, sep='\t', header=None, names=['study_id'])

studies_pattern = r"(?<=gs:\/\/genetics-portal-dev-sumstats\/unfiltered\/gwas\/)(.*)(?=.parquet\/)"
from_sumstats = list(sumstat_studies.study_id.str.extract(studies_pattern).dropna()[0].unique())

# Annotate study table with field showing if there are sumstats
merged['has_sumstats'] = merged['study_id'].isin(from_sumstats)
Expand Down Expand Up @@ -184,9 +181,6 @@ def main():
# Sort output
merged = merged.sort_values(['study_id'])

# DEBUG output study table
merged.to_csv('tmp/study_table.tsv', sep='\t', index=None)

# Save as parquet
array_cols = ['trait_efos', 'ancestry_initial', 'ancestry_replication']
write_parquet(merged, args.output, str_list_cols=array_cols, compression='snappy', flavor='spark')
Expand Down
4 changes: 2 additions & 2 deletions snakefiles/study_and_top_loci_tables.Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,11 @@ rule list_studies_with_sumstats:
''' Makes a list of files with sumstats
'''
params:
url=config['gcs_sumstat_pattern'],
url=config['sumstats_gcs_path'],
output:
tmpdir + '/{version}/studies_with_sumstats.tsv'
run:
shell('gsutil -m ls -d "{params.url}" > {output}')
shell('gsutil -m ls {params.url} > {output}')

rule study_table_to_parquet:
''' Converts study table to final parquet.
Expand Down

0 comments on commit 4b5e6f9

Please sign in to comment.