WIP pipeline changes

alimourey · alimourey · commit 5c5f1cc664cf · 2025-11-07T15:57:43.000-05:00
diff --git a/pipeline/context_explorer/get_context_analysis.conseq b/pipeline/context_explorer/get_context_analysis.conseq
@@ -1,17 +1,14 @@
 rule get_context_analysis:
     inputs:
         script=fileref("./get_context_analysis.py"),
-        artifacts=all {"type" ~ "subtype_tree|subtype_context_matrix|repurposing_matrix_taiga_id|repurposing_list_taiga_id|prism_oncology_reference_auc_matrix"},
+        artifacts=all {"type" ~ "subtype_tree|subtype_context_matrix|prism_oncology_reference_auc_matrix"},
 #        subtype_tree_taiga_id=all {"type":"subtype_tree"},
 #       context_matrix_taiga_id=all {"type":"subtype_context_matrix"},
         gene_effect_taiga_id={"type":"raw-dep-matrix", "label": 'Chronos_Combined'},
         gene_dependency_taiga_id={"type":"raw-dep-prob-matrix", "label": 'Chronos_Combined'},
-#        repurposing_matrix_taiga_id=all {"type": "repurposing_matrix_taiga_id"},
-#        repurposing_list_taiga_id=all {"type": "repurposing_list_taiga_id"},
+        portal_compounds={"type": "drug-metadata", "name": "merged-drugs"},
+        repurposing_matrix_taiga_id=all {"type": "drug_screen_auc_matrix", "label": "Repurposing_secondary_AUC"}
 #        oncref_auc_taiga_id=all {"type":"prism_oncology_reference_auc_matrix"},
-        compound_summaries=all {"type" ~ "compound-summary"},
-#       compound_summary_repurposing={"type": "compound-summary", "dataset": "Rep_all_single_pt"},
-#       compound_summary_oncref={"type": "compound-summary", "dataset": "Prism_oncology_AUC"},
         tda_table={"type":"tda-table"},
     outputs:
         {"type": "context_analysis", "filename": { "$filename": "context_analysis.csv"} }
@@ -20,23 +17,13 @@ rule get_context_analysis:
     
         artifacts = {{ inputs.artifacts }}
 
-        compound_tables = {{ inputs.compound_summaries }}
-    
-        # oncref_table_path is optional, because it should not be there in the public env
-        oncref_table_path = [cmpd for cmpd in compound_tables if cmpd and 'dataset' in cmpd and cmpd['dataset'] == 'Prism_oncology_AUC']
-        oncref_table_path = oncref_table_path[0] if len(oncref_table_path) > 0 else None
-        
-        # repurposing_table_path is required in all envs
-        repurposing_table_path = [cmpd for cmpd in compound_tables if cmpd and cmpd['dataset'] == 'Rep_all_single_pt']
-        assert len(repurposing_table_path) == 1, f"Expected exactly one Rep_all_single_pt compound table, got {len(repurposing_table_path)}"
-        
         # transformed will be our newly constructed dict of name -> artifact
         transformed = {
             # handle the ones that couldn't uniquely be identified by type specially
-            "repurposing_table_path": repurposing_table_path,
-            "oncref_table_path": [] if oncref_table_path is None else [oncref_table_path],
             "gene_effect_taiga_id": [ {{ inputs.gene_effect_taiga_id }} ],
             "gene_dependency_taiga_id": [ {{ inputs.gene_dependency_taiga_id }} ],
+            "portal_compounds_taiga_id": [ {{ inputs.portal_compounds }} ],
+            "repurposing_matrix_taiga_id": [ {{ inputs.repurposing_matrix_taiga_id }} ],
             "tda_table": [ {{ inputs.tda_table }} ],
             "script": {{ inputs.script }}            
         }
@@ -48,7 +35,6 @@ rule get_context_analysis:
                 ('subtype_tree_taiga_id','subtype_tree'), 
                 ('context_matrix_taiga_id', 'subtype_context_matrix'),   
                 ('repurposing_matrix_taiga_id', 'repurposing_matrix_taiga_id'), 
-                ('repurposing_list_taiga_id', 'repurposing_list_taiga_id'), 
                 ('oncref_auc_taiga_id', 'prism_oncology_reference_auc_matrix')]:
             artifact = by_type.get(type_name)
             transformed[dest_name] = [ artifact ] if artifact is not None else []
diff --git a/pipeline/context_explorer/get_context_analysis.py b/pipeline/context_explorer/get_context_analysis.py
@@ -5,7 +5,9 @@
 import warnings
 import argparse
 import json
-
+from scripts.calculate_bimodality_coefficient import (
+    bimodality_coefficient_for_cpd_viabilities,
+)
 from taigapy import create_taiga_client_v3
 
 MIN_GROUP_SIZE = 5
@@ -48,8 +50,8 @@ def load_crispr_data(
     return gene_effect, gene_dependency
 
 
-def load_prism_data(tc, repurposing_matrix_taiga_id, repurposing_list_taiga_id):
-    Extended_Primary_Compound_List = tc.get(repurposing_list_taiga_id)
+def load_prism_data(tc, repurposing_matrix_taiga_id, portal_compounds_taiga_id):
+    Portal_Compounds = tc.get(portal_compounds_taiga_id)
     Extended_Primary_Data_Matrix = tc.get(repurposing_matrix_taiga_id).T
     Data_Matrix_Discrete = Extended_Primary_Data_Matrix < np.log2(0.3)
     Data_Matrix_Discrete = Data_Matrix_Discrete.mask(
@@ -146,11 +148,9 @@ def load_all_data(
     gene_effect_taiga_id,
     gene_dependency_taiga_id,
     repurposing_matrix_taiga_id,
-    repurposing_list_taiga_id,
     oncref_auc_taiga_id,
-    repurposing_table_path,
-    oncref_table_path,
     tda_table_path,
+    portal_compounds_taiga_id,
 ):
 
     all_data_dict = dict()
@@ -182,7 +182,7 @@ def load_all_data(
     rep_sensitivity = load_prism_data(
         tc=tc,
         repurposing_matrix_taiga_id=repurposing_matrix_taiga_id,
-        repurposing_list_taiga_id=repurposing_list_taiga_id,
+        portal_compounds_taiga_id=portal_compounds_taiga_id,
     )
     datasets_to_test["PRISMRepurposing"] = rep_sensitivity
 
@@ -469,35 +469,28 @@ def compute_context_explorer_results(inputs, out_filename):
     repurposing_matrix_taiga_id = get_id_or_file_name(
         taiga_ids_or_file_name["repurposing_matrix_taiga_id"]
     )
-    repurposing_list_taiga_id = get_id_or_file_name(
-        taiga_ids_or_file_name["repurposing_list_taiga_id"]
-    )
     oncref_auc_taiga_id = get_id_or_file_name(
         taiga_ids_or_file_name["oncref_auc_taiga_id"]
     )
 
-    repurposing_table_path = get_id_or_file_name(
-        taiga_ids_or_file_name["repurposing_table_path"], id_key="filename"
-    )
-    oncref_table_path = get_id_or_file_name(
-        taiga_ids_or_file_name["oncref_table_path"], id_key="filename"
-    )
     tda_table_path = get_id_or_file_name(
         taiga_ids_or_file_name["tda_table"], id_key="filename"
     )
 
+    portal_compounds_taiga_id = get_id_or_file_name(
+        taiga_ids_or_file_name["portal_compounds_taiga_id"]
+    )
+
     ### ---- LOAD DATA ---- ###
     data_dict = load_all_data(
-        subtype_tree_taiga_id,
-        context_matrix_taiga_id,
-        gene_effect_taiga_id,
-        gene_dependency_taiga_id,
-        repurposing_matrix_taiga_id,
-        repurposing_list_taiga_id,
-        oncref_auc_taiga_id,
-        repurposing_table_path,
-        oncref_table_path,
-        tda_table_path,
+        subtype_tree_taiga_id=subtype_tree_taiga_id,
+        context_matrix_taiga_id=context_matrix_taiga_id,
+        gene_effect_taiga_id=gene_effect_taiga_id,
+        gene_dependency_taiga_id=gene_dependency_taiga_id,
+        repurposing_matrix_taiga_id=repurposing_matrix_taiga_id,
+        oncref_auc_taiga_id=oncref_auc_taiga_id,
+        tda_table_path=tda_table_path,
+        portal_compounds_taiga_id=portal_compounds_taiga_id,
     )
 
     context_explorer_results = compute_in_out_groups(**data_dict)
diff --git a/pipeline/scripts/calculate_bimodality_coefficient.py b/pipeline/scripts/calculate_bimodality_coefficient.py
@@ -0,0 +1,20 @@
+from typing import Optional
+
+import numpy as np
+import pandas as pd
+
+
+def bimodality_coefficient_for_cpd_viabilities(
+    cpd_viabilities: pd.Series,
+) -> Optional[float]:
+    """Return the bimodality coefficient of one compound's viabilities.
+
+    Missing values are dropped first. With ``n`` values remaining, the result
+    is ``(skewness**2 + 1) / (excess_kurtosis + 3*(n-1)**2 / ((n-2)*(n-3)))``,
+    where skewness and kurtosis are the means of the 3rd and 4th powers of
+    the standardized values.
+
+    Args:
+        cpd_viabilities: Viability values for a single compound; may
+            contain NaN.
+
+    Returns:
+        The coefficient as a scalar, or ``None`` when 20 or fewer
+        non-missing values remain.
+    """
+    x = cpd_viabilities.dropna()
+    # NaNs are already removed, so the sample size is just the length.
+    n = len(x)
+    if n <= 20:
+        return None
+
+    standardized = np.divide(np.subtract(x, np.mean(x)), np.sqrt(np.var(x)))
+    skewness = np.mean(np.power(standardized, 3))
+    excess_kurtosis = np.mean(np.power(standardized, 4)) - 3
+    # Sample-size term of the coefficient's denominator.
+    sample_size_term = 3 * np.power(n - 1, 2) / np.multiply(n - 2, n - 3)
+    return (np.power(skewness, 2) + 1) / (excess_kurtosis + sample_size_term)
diff --git a/pipeline/scripts/compound_summary_merge.py b/pipeline/scripts/compound_summary_merge.py
@@ -4,6 +4,8 @@
 
 import sys
 
+from calculate_bimodality_coefficient import bimodality_coefficient_for_cpd_viabilities
+
 sys.path.append(".")
 from hdf5_utils import read_hdf5
 
@@ -136,25 +138,6 @@ def get_dose_description(df):
     merged_df.to_csv(args.output_filename, index=False, na_rep="NA")
 
 
-def bimodality_coefficient_for_cpd_viabilities(cpd_viabilities: pd.Series) -> pd.Series:
-    x = cpd_viabilities.dropna()
-    num_viabilities = len(x)
-    if num_viabilities > 20:
-        s1 = np.mean(x)
-        s2 = np.var(x)
-        x_ = np.divide(np.subtract(x, s1), np.sqrt(s2))
-        s3 = np.mean(np.power(x_, 3))
-        s4 = np.mean(np.power(x_, 4))
-        n = (1 - np.isnan(x)).sum()
-        bimodality_coefficient = (np.power(s3, 2) + 1) / (
-            s4 - 3 + 3 * np.power(n - 1, 2) / (np.multiply(n - 2, n - 3))
-        )
-    else:
-        bimodality_coefficient = None
-
-    return bimodality_coefficient
-
-
 def get_sensitive_cell_lines_count(
     dataset_viabilities_df: pd.DataFrame, units: str
 ) -> pd.Series:
diff --git a/pipeline/xrefs-common.conseq b/pipeline/xrefs-common.conseq
@@ -197,6 +197,7 @@ add-if-missing {
 
 ####### Repurposing secondary screen
 
+### NOTE: Context Explorer (pipeline/context_explorer/get_context_analysis.conseq) also consumes this artifact, so any change here affects it too!
 add-if-missing {
   "type": "drug_screen_auc_matrix",
   "dataset_id": "processed-repurposing-secondary-e3aa.2/REPURPOSINGAUCMatrix",
@@ -307,7 +308,4 @@ add-if-missing {
   "dataset_id": "processed-gdsc-ee73.2/GDSC2ResponseCurves",
   "conditions_dataset_id": "processed-gdsc-ee73.2/GDSC2Log2ViabilityCollapsedConditions",
   "sample_id_prefix": "GDSC2"
-}
-
-
-
+}