Merge pull request opentargets#23 from opentargets/il-other

Studies w/ sumstats enhancement
thehyve · Jul 12, 2022 · 4b5e6f9 · 4b5e6f9
2 parents 8b9180e + 977dc99
commit 4b5e6f9
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 13 deletions.
diff --git a/configs/config.yaml b/configs/config.yaml
@@ -9,7 +9,7 @@ gwas_cat_cluster_min_loci: 10 # Minimum number of reported loci for that study t
 gwas_cat_cluster_multi_proportion: 0.3 # For a given study, if more than this proportion of loci are multi-signals (>1 signal within gwas_cat_cluster_dist_kb), the study will be clustered
 
 # Summary statistics finemapping pipeline output files
-gcs_sumstat_pattern: 'gs://genetics-portal-dev-sumstats/unfiltered/gwas/*.parquet'
+sumstats_gcs_path: 'gs://genetics-portal-dev-sumstats/unfiltered/gwas'
 toploci: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/top_loci.json.gz'
 credsets: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/credset/_SUCCESS' # Need the whole directory, so passing the _SUCCESS file instead to trick snakemake
 

diff --git a/scripts/study_table_to_parquet.py b/scripts/study_table_to_parquet.py
@@ -27,13 +27,10 @@ def main():
     #
 
     # Load list of study IDs that have sumstats
-    from_sumstats = set([])
-    with open(args.sumstat_studies, 'r') as in_h:
-        for line in in_h:
-            # Get study_id
-            line = line.rstrip().rstrip('/')
-            stid = os.path.basename(line).replace('.parquet', '')
-            from_sumstats.add(stid)
+    sumstat_studies = pd.read_csv(args.sumstat_studies, sep='\t', header=None, names=['study_id'])
+
+    studies_pattern = r"(?<=gs:\/\/genetics-portal-dev-sumstats\/unfiltered\/gwas\/)(.*)(?=.parquet\/)"
+    from_sumstats = list(sumstat_studies.study_id.str.extract(studies_pattern).dropna()[0].unique())
 
     # Annotate study table with field showing if there are sumstats
     merged['has_sumstats'] = merged['study_id'].isin(from_sumstats)
@@ -184,9 +181,6 @@ def main():
     # Sort output
     merged = merged.sort_values(['study_id'])
 
-    # DEBUG output study table
-    merged.to_csv('tmp/study_table.tsv', sep='\t', index=None)
-
     # Save as parquet
     array_cols = ['trait_efos', 'ancestry_initial', 'ancestry_replication']
     write_parquet(merged, args.output, str_list_cols=array_cols, compression='snappy', flavor='spark')

diff --git a/snakefiles/study_and_top_loci_tables.Snakefile b/snakefiles/study_and_top_loci_tables.Snakefile
@@ -223,11 +223,11 @@ rule list_studies_with_sumstats:
     ''' Makes a list of files with sumstats
     '''
     params:
-        url=config['gcs_sumstat_pattern'],
+        url=config['sumstats_gcs_path'],
     output:
         tmpdir + '/{version}/studies_with_sumstats.tsv'
     run:
-        shell('gsutil -m ls -d "{params.url}" > {output}')
+        shell('gsutil -m ls {params.url} > {output}')
 
 rule study_table_to_parquet:
     ''' Converts study table to final parquet.