Skip to content

Commit 4ffab3c

Browse files
committed
Fix pyright errors
1 parent e71aed5 commit 4ffab3c

File tree

8 files changed

+31
-168
lines changed

8 files changed

+31
-168
lines changed

portal-backend/depmap/context_explorer/api.py

Lines changed: 0 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -693,143 +693,3 @@ def get(self):
693693
)
694694

695695
return dataclasses.asdict(tile_data)
696-
697-
698-
### TEMPORARY FOR TESTING
699-
700-
import pandas as pd
701-
from pathlib import Path
702-
from typing import Union
703-
from depmap.database import db
704-
import re
705-
706-
707-
def fix_dataset_given_ids(value):
708-
dataset_str_to_name_mapping = {
709-
"CRISPR": "Chronos_Combined",
710-
"PRISMOncRef": "Prism_oncology_AUC_collapsed",
711-
"PRISMRepurposing": "REPURPOSING_AUC_collapsed",
712-
}
713-
714-
assert (
715-
value in dataset_str_to_name_mapping.keys()
716-
), f"This key is not present {value}"
717-
718-
return dataset_str_to_name_mapping[value]
719-
720-
721-
def extract_or_keep(value):
722-
assert not pd.isna(value)
723-
724-
# Ensure we are working with a string for regex matching
725-
value_str = str(value)
726-
727-
# Regex pattern: \((\d+)\) captures one or more digits between parentheses
728-
match = re.search(r"\s\((\d+)\)$", value_str)
729-
730-
if match:
731-
# Keep entrez ids as strings even though they could be ints to allow storing in a column that also
732-
# includes compound_ids (e.g. DPC-00001)
733-
return match.group(1)
734-
else:
735-
# If no pattern match is found, return the original value
736-
return value
737-
738-
739-
def clean_csv_duplicates(df: pd.DataFrame) -> pd.DataFrame:
740-
"""
741-
Cleans the DataFrame by removing rows that would violate the UNIQUE constraint
742-
on the combination of subtype_code, feature_id (entity_id), and out_group.
743-
744-
Args:
745-
df: The DataFrame loaded from the input CSV.
746-
747-
Returns:
748-
A DataFrame with duplicates removed based on the unique key columns.
749-
"""
750-
751-
# We must rename columns temporarily to match the required SQL column names for cleaning
752-
temp_df = df.rename(
753-
columns={"entity_id": "feature_id", "dataset": "dataset_given_id"}
754-
)
755-
756-
# Identify the columns that constitute the unique key constraint
757-
key_columns = ["subtype_code", "feature_id", "out_group", "dataset_given_id"]
758-
759-
# Identify all rows that are duplicates based on the key, keeping only the first one
760-
# This mask is True for all rows that are NOT the first occurrence of a duplicate key.
761-
duplicate_mask = temp_df.duplicated(subset=key_columns, keep="first")
762-
763-
# 1. Get the DataFrame of removed (duplicate) rows
764-
df_removed = temp_df[duplicate_mask].copy()
765-
766-
# 2. Get the cleaned DataFrame (rows that were kept)
767-
df_cleaned = temp_df[~duplicate_mask].copy()
768-
769-
return df_cleaned, df_removed
770-
771-
772-
def load_csv_to_sqlite(csv_file_path: Union[str, Path]) -> int:
773-
# Define the mapping from CSV column names to SQL table column names
774-
COLUMN_RENAME_MAP = {"entity_id": "feature_id", "dataset": "dataset_given_id"}
775-
776-
print(f"Loading data from {csv_file_path}...")
777-
778-
try:
779-
# 1. Read the CSV data into a Pandas DataFrame
780-
df = pd.read_csv(csv_file_path)
781-
except FileNotFoundError:
782-
print(f"Error: CSV file not found at {csv_file_path}")
783-
return 0
784-
785-
# 1.5. Clean the DataFrame to eliminate rows that violate the UNIQUE constraint
786-
df_cleaned, df_removed = clean_csv_duplicates(df)
787-
788-
df = df_cleaned # Use the cleaned dataframe for insertion
789-
790-
rows_removed = len(df_removed)
791-
792-
if rows_removed > 0:
793-
# Define the path for the removed rows CSV file
794-
path_obj = Path(csv_file_path)
795-
removed_csv_path = path_obj.parent / f"{path_obj.stem}_removed_duplicates.csv"
796-
797-
# Write removed rows to CSV
798-
df_removed.to_csv(removed_csv_path, index=False)
799-
800-
print(
801-
f"WARNING: Removed {rows_removed} duplicate rows based on the UNIQUE key (subtype, feature_id, out_group)."
802-
)
803-
print(f"Removed rows written to: {removed_csv_path}")
804-
805-
# 2. Rename columns to match the SQL schema
806-
df = df.rename(columns=COLUMN_RENAME_MAP)
807-
df["feature_id"] = df["feature_id"].apply(extract_or_keep)
808-
809-
df_remove_null_subtypes = df.dropna(subset=["subtype_code"])
810-
df_remove_null_subtypes["dataset_given_id"] = df_remove_null_subtypes[
811-
"dataset_given_id"
812-
].apply(fix_dataset_given_ids)
813-
814-
try:
815-
# 4. Insert data into the context_analysis table using the SQLAlchemy engine
816-
# if_exists='append' is the default and safe for new rows, but will fail on duplicates.
817-
rows_inserted = df_remove_null_subtypes.to_sql(
818-
"context_analysis", db.engine, if_exists="append", index=False
819-
)
820-
821-
print(f"Successfully inserted {rows_inserted} rows.")
822-
return rows_inserted
823-
824-
except Exception as e:
825-
print(e)
826-
# FIX: Catch the specific IntegrityError (UNIQUE constraint violation)
827-
# and abort gracefully to PREVENT the loss of context_analysis_id sequence.
828-
print(
829-
f"ERROR: Insertion failed due to UNIQUE constraint violation. "
830-
f"The primary key sequence has been preserved."
831-
)
832-
print(
833-
"To proceed, you must manually run SQL to use INSERT OR IGNORE/REPLACE, or clean your input data."
834-
)
835-
return 0

portal-backend/depmap/context_explorer/box_plot_utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ def get_branch_subtype_codes_organized_by_code(sig_contexts: Dict[str, List[str]
277277
def get_sig_context_dataframe(
278278
tree_type: str,
279279
feature_type: str,
280-
feature_id: int,
280+
feature_id: str, # entrez_id for genes. compound_id for compounds.
281281
dataset_given_id: str,
282282
max_fdr: float = 0.1,
283283
min_abs_effect_size: float = 0.25,
@@ -529,7 +529,10 @@ def get_organized_contexts(
529529
def get_gene_enriched_lineages_entity_id_and_dataset_name(
530530
feature_id: str,
531531
) -> Optional[dict]:
532-
gene = Gene.get_gene_by_entrez(feature_id)
532+
gene = Gene.get_gene_by_entrez(int(feature_id))
533+
534+
assert gene is not None
535+
533536
dataset = get_dependency_dataset_for_entity(
534537
DependencyDataset.DependencyEnum.Chronos_Combined.name, gene.entity_id
535538
)

portal-backend/depmap/context_explorer/dose_curve_utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,9 @@ def get_context_dose_curves(
167167
key_name="auc_dataset_given_id",
168168
value_name=dataset_given_id,
169169
)
170+
171+
assert drc_dataset is not None
172+
170173
replicate_dataset_id = drc_dataset.replicate_dataset
171174

172175
replicate_dataset_name = replicate_dataset_id

portal-backend/depmap/context_explorer/models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ class ContextPlotBoxData:
7373

7474
@dataclass
7575
class NodeEntityData:
76-
feature_id: int
76+
feature_id: str
7777
label: str
7878
feature_full_row_of_values: pd.Series
7979
feature_overview_page_label: str
@@ -395,7 +395,7 @@ def find_context_analysis_by_subtype_code_out_group(
395395
@staticmethod
396396
def get_context_dependencies(
397397
tree_type: str,
398-
feature_id: int,
398+
feature_id: str,
399399
dataset_given_id: str,
400400
feature_type: str,
401401
max_fdr: float,

portal-backend/depmap/context_explorer/utils.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Literal, Optional
1+
from typing import Dict, List, Literal, Optional
22
import pandas as pd
33

44
from depmap import data_access
@@ -42,21 +42,20 @@ def get_path_to_node(selected_code: str) -> ContextPathInfo:
4242
return ContextPathInfo(path=path, tree_type=tree_type)
4343

4444

45-
# For genes, full label refers to gene_symbol (entrez_id)
46-
def get_feature_id_from_full_label(feature_type: str, feature_id: str) -> dict:
45+
def get_feature_id_from_full_label(
46+
feature_type: str, feature_id: str
47+
) -> Dict[str, str]:
4748
if feature_type == "gene":
48-
gene = Gene.get_gene_by_entrez(feature_id)
49+
gene = Gene.get_gene_by_entrez(int(feature_id))
4950

5051
assert gene is not None
5152
label = gene.label
5253
entity_overview_page_label = gene.label
53-
feature_id = gene.entrez_id
5454

5555
else:
5656
compound = Compound.get_by_compound_id(feature_id)
5757
label = compound.label
5858
entity_overview_page_label = compound.label
59-
feature_id = compound.compound_id # e.g. DPC-000001
6059

6160
return {
6261
"feature_id": feature_id,
@@ -67,7 +66,7 @@ def get_feature_id_from_full_label(feature_type: str, feature_id: str) -> dict:
6766

6867
def find_compound_dataset(
6968
datasets: List[DRCCompoundDataset], key_name: str, value_name: str
70-
) -> DRCCompoundDataset:
69+
) -> Optional[DRCCompoundDataset]:
7170
"""
7271
Searches a list of DRCCompoundDataset objects for the first object
7372
whose attribute (key_name) matches the specified value (value_name).

portal-backend/pyright-ratchet-errors.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -392,13 +392,9 @@ test_executive.py: error: Argument of type "GeneFactory" cannot be assigned to p
392392
test_executive.py: error: Argument of type "GeneFactory" cannot be assigned to parameter "gene" of type "Gene" in function "format_dep_dist_info"
393393
test_executive.py: error: Argument of type "list[DependencyDatasetFactory]" cannot be assigned to parameter "datasets" of type "List[DependencyDataset]" in function "format_predictability_tile"
394394
test_executive.py: error: Argument of type "list[Unknown] | None" cannot be assigned to parameter "__obj" of type "Sized" in function "len"
395-
test_executive.py: error: Cannot access member "dataset_id" for type "DependencyDatasetFactory"
396-
test_executive.py: error: Cannot access member "entity_id" for type "CompoundExperimentFactory"
397395
test_executive.py: error: Cannot access member "units" for type "LazyAttribute"
398396
test_executive.py: error: Object of type "None" cannot be used as iterable value (reportOptionalIterable)
399397
test_executive.py: error: Object of type "None" is not subscriptable (reportOptionalSubscript)
400-
test_executive.py: error: Operator "in" not supported for types "Literal['svg']" and "dict[str, dict[Unknown, Unknown]] | dict[str, Unknown] | None"
401-
test_executive.py: error: Operator "not in" not supported for types "Literal['svg']" and "dict[str, dict[Unknown, Unknown]] | dict[str, Unknown] | None"
402398
test_factories.py: error: "cell_line_1" is possibly unbound (reportPossiblyUnboundVariable)
403399
test_get_and_process_data.py: error: "file_path" is not a known member of "None" (reportOptionalMemberAccess)
404400
test_get_and_process_data.py: error: "row_index" is not a known member of "None" (reportOptionalMemberAccess)

portal-backend/tests/depmap/context_explorer/test_box_plot_utils.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def set_up_node_and_context_objects(
184184
]
185185

186186
feature_id = (
187-
GeneFactory(entrez_id="entrez_id").entrez_id
187+
str(GeneFactory(entrez_id="entrez_id").entrez_id)
188188
if feature_type == "gene"
189189
else CompoundFactory(compound_id="compound_id").compound_id
190190
)
@@ -316,6 +316,7 @@ def test_get_sig_context_dataframe_level_0_significant(
316316
make_level_0_significant=True,
317317
tree_type=tree_type,
318318
)
319+
feature_id = str(feature_id)
319320

320321
empty_db_mock_downloads.session.flush()
321322

@@ -359,6 +360,7 @@ def test_get_sig_context_dataframe_level_0_not_significant(
359360
make_level_0_significant=False,
360361
tree_type=tree_type,
361362
)
363+
feature_id = str(feature_id)
362364

363365
empty_db_mock_downloads.session.flush()
364366

@@ -441,6 +443,7 @@ def test_get_sig_context_data_frame_show_positive_effect_sizes(
441443
make_level_0_significant=True,
442444
tree_type=tree_type,
443445
)
446+
feature_id = str(feature_id)
444447
empty_db_mock_downloads.session.flush()
445448

446449
max_fdr, min_abs_effect_size, frac_dep_in = get_context_explorer_box_plot_filters(
@@ -537,6 +540,7 @@ def test_get_context_plot_data(
537540
make_level_0_significant=True,
538541
tree_type=tree_type,
539542
)
543+
feature_id = str(feature_id)
540544
empty_db_mock_downloads.session.flush()
541545
interactive_test_utils.reload_interactive_config()
542546

@@ -675,7 +679,7 @@ def test_get_data_to_show_if_no_contexts_significant(
675679
make_level_0_significant=False,
676680
tree_type=tree_type,
677681
)
678-
682+
feature_id = str(feature_id)
679683
empty_db_mock_downloads.session.flush()
680684
interactive_test_utils.reload_interactive_config()
681685

portal-backend/tests/depmap/context_explorer/test_context_analysis.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -127,16 +127,14 @@ def _setup_factories(
127127
empty_db_mock_downloads,
128128
dataset_given_id: str,
129129
monkeypatch,
130-
gene_a: Optional[GeneFactory] = None,
131-
gene_b: Optional[GeneFactory] = None,
132-
compound_a: Optional[CompoundFactory] = None,
133-
compound_b: Optional[CompoundFactory] = None,
134-
compound_exp_a: Optional[CompoundExperimentFactory] = None,
135-
compound_exp_b: Optional[CompoundExperimentFactory] = None,
130+
gene_a: GeneFactory,
131+
gene_b: GeneFactory,
132+
compound_a: CompoundFactory,
133+
compound_b: CompoundFactory,
134+
compound_exp_a: CompoundExperimentFactory,
135+
compound_exp_b: CompoundExperimentFactory,
136136
feature_type: Literal["gene", "compound"] = "gene",
137-
) -> List[DepmapModelFactory]:
138-
assert gene_a and gene_b if feature_type == "gene" else compound_a and compound_b
139-
137+
):
140138
use_genes = feature_type == "gene"
141139

142140
bone_es_cell_lines = [
@@ -1074,7 +1072,7 @@ def test_get_drug_dotted_line(empty_db_mock_downloads, dataset_given_id, monkeyp
10741072

10751073
interactive_test_utils.reload_interactive_config()
10761074

1077-
feature_id = gene_a.entrez_id if use_genes else compound_a.compound_id
1075+
feature_id = str(gene_a.entrez_id) if use_genes else compound_a.compound_id
10781076

10791077
(entity_full_row_of_values) = get_full_row_of_values_and_depmap_ids(
10801078
dataset_given_id=dataset_given_id, feature_id=feature_id
@@ -1146,7 +1144,7 @@ def test_get_box_plot_data(empty_db_mock_downloads, dataset_given_id, monkeypatc
11461144

11471145
interactive_test_utils.reload_interactive_config()
11481146

1149-
feature_id = gene_a.entrez_id if use_genes else compound_a.compound_id
1147+
feature_id = str(gene_a.entrez_id) if use_genes else compound_a.compound_id
11501148

11511149
### Test - the User is on the Lineage tab and selects "BONE". Then, the
11521150
### user selects a specific gene/compound from either the scatter plots or

0 commit comments

Comments (0)