diff --git a/configs/config.yaml b/configs/config.yaml
index 5be48ce..cae05ba 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -9,7 +9,7 @@ gwas_cat_cluster_min_loci: 10 # Minimum number of reported loci for that study t
 gwas_cat_cluster_multi_proportion: 0.3 # For a given study, if more than this proportion of loci are multi-signals (>1 signal within gwas_cat_cluster_dist_kb), the study will be clustered
 
 # Summary statistics finemapping pipeline output files
-gcs_sumstat_pattern: 'gs://genetics-portal-dev-sumstats/unfiltered/gwas/*.parquet'
+sumstats_gcs_path: 'gs://genetics-portal-dev-sumstats/unfiltered/gwas'
 toploci: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/top_loci.json.gz'
 credsets: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/credset/_SUCCESS' # Need the whole directory, so passing the _SUCCSS file instead to trick snakemake
 
diff --git a/scripts/study_table_to_parquet.py b/scripts/study_table_to_parquet.py
index b8b96e6..c3cb20f 100644
--- a/scripts/study_table_to_parquet.py
+++ b/scripts/study_table_to_parquet.py
@@ -27,13 +27,12 @@ def main():
     #
 
     # Load list of study IDs that have sumstats
-    from_sumstats = set([])
-    with open(args.sumstat_studies, 'r') as in_h:
-        for line in in_h:
-            # Get study_id
-            line = line.rstrip().rstrip('/')
-            stid = os.path.basename(line).replace('.parquet', '')
-            from_sumstats.add(stid)
+    sumstat_studies = pd.read_csv(args.sumstat_studies, sep='\t', header=None, names=['study_id'])
+
+    # Study ID is the last path component minus '.parquet'; the pattern is kept
+    # independent of the bucket so it cannot drift from `sumstats_gcs_path`
+    studies_pattern = r"([^/]+)\.parquet/?$"
+    from_sumstats = set(sumstat_studies.study_id.str.extract(studies_pattern, expand=False).dropna())
 
     # Annotate study table with field showing if there are sumstats
     merged['has_sumstats'] = merged['study_id'].isin(from_sumstats)
@@ -184,9 +183,6 @@ def main():
     # Sort output
     merged = merged.sort_values(['study_id'])
 
-    # DEBUG output study table
-    merged.to_csv('tmp/study_table.tsv', sep='\t', index=None)
-
     # Save as parquet
     array_cols = ['trait_efos', 'ancestry_initial', 'ancestry_replication']
     write_parquet(merged, args.output, str_list_cols=array_cols, compression='snappy', flavor='spark')
diff --git a/snakefiles/study_and_top_loci_tables.Snakefile b/snakefiles/study_and_top_loci_tables.Snakefile
index 13317d8..71b2e3e 100644
--- a/snakefiles/study_and_top_loci_tables.Snakefile
+++ b/snakefiles/study_and_top_loci_tables.Snakefile
@@ -223,11 +223,11 @@ rule list_studies_with_sumstats:
     ''' Makes a list of files with sumstats
     '''
     params:
-        url=config['gcs_sumstat_pattern'],
+        url=config['sumstats_gcs_path'],
     output:
         tmpdir + '/{version}/studies_with_sumstats.tsv'
     run:
-        shell('gsutil -m ls -d "{params.url}" > {output}')
+        shell('gsutil -m ls "{params.url}" > {output}')
 
 rule study_table_to_parquet:
     ''' Converts study table to final parquet.