Skip to content

Commit 4ffab3c

Browse files
committed
Fix pyright errors
1 parent e71aed5 commit 4ffab3c

File tree

8 files changed

+31
-168
lines changed

8 files changed

+31
-168
lines changed

portal-backend/depmap/context_explorer/api.py

Lines changed: 0 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -693,143 +693,3 @@ def get(self):
693693
)
694694

695695
return dataclasses.asdict(tile_data)
696-
697-
698-
### TEMPORARY FOR TESTING
699-
700-
import pandas as pd
701-
from pathlib import Path
702-
from typing import Union
703-
from depmap.database import db
704-
import re
705-
706-
707-
def fix_dataset_given_ids(value):
708-
dataset_str_to_name_mapping = {
709-
"CRISPR": "Chronos_Combined",
710-
"PRISMOncRef": "Prism_oncology_AUC_collapsed",
711-
"PRISMRepurposing": "REPURPOSING_AUC_collapsed",
712-
}
713-
714-
assert (
715-
value in dataset_str_to_name_mapping.keys()
716-
), f"This key is not present {value}"
717-
718-
return dataset_str_to_name_mapping[value]
719-
720-
721-
def extract_or_keep(value):
722-
assert not pd.isna(value)
723-
724-
# Ensure we are working with a string for regex matching
725-
value_str = str(value)
726-
727-
# Regex pattern: \((\d+)\) captures one or more digits between parentheses
728-
match = re.search(r"\s\((\d+)\)$", value_str)
729-
730-
if match:
731-
# Keep entrez ids as strings even though they could be ints to allow storing in a column that also
732-
# includes compound_ids (e.g. DPC-00001)
733-
return match.group(1)
734-
else:
735-
# If no pattern match is found, return the original value
736-
return value
737-
738-
739-
def clean_csv_duplicates(df: pd.DataFrame) -> pd.DataFrame:
740-
"""
741-
Cleans the DataFrame by removing rows that would violate the UNIQUE constraint
742-
on the combination of subtype_code, feature_id (entity_id), and out_group.
743-
744-
Args:
745-
df: The DataFrame loaded from the input CSV.
746-
747-
Returns:
748-
A DataFrame with duplicates removed based on the unique key columns.
749-
"""
750-
751-
# We must rename columns temporarily to match the required SQL column names for cleaning
752-
temp_df = df.rename(
753-
columns={"entity_id": "feature_id", "dataset": "dataset_given_id"}
754-
)
755-
756-
# Identify the columns that constitute the unique key constraint
757-
key_columns = ["subtype_code", "feature_id", "out_group", "dataset_given_id"]
758-
759-
# Identify all rows that are duplicates based on the key, keeping only the first one
760-
# This mask is True for all rows that are NOT the first occurrence of a duplicate key.
761-
duplicate_mask = temp_df.duplicated(subset=key_columns, keep="first")
762-
763-
# 1. Get the DataFrame of removed (duplicate) rows
764-
df_removed = temp_df[duplicate_mask].copy()
765-
766-
# 2. Get the cleaned DataFrame (rows that were kept)
767-
df_cleaned = temp_df[~duplicate_mask].copy()
768-
769-
return df_cleaned, df_removed
770-
771-
772-
def load_csv_to_sqlite(csv_file_path: Union[str, Path]) -> int:
773-
# Define the mapping from CSV column names to SQL table column names
774-
COLUMN_RENAME_MAP = {"entity_id": "feature_id", "dataset": "dataset_given_id"}
775-
776-
print(f"Loading data from {csv_file_path}...")
777-
778-
try:
779-
# 1. Read the CSV data into a Pandas DataFrame
780-
df = pd.read_csv(csv_file_path)
781-
except FileNotFoundError:
782-
print(f"Error: CSV file not found at {csv_file_path}")
783-
return 0
784-
785-
# 1.5. Clean the DataFrame to eliminate rows that violate the UNIQUE constraint
786-
df_cleaned, df_removed = clean_csv_duplicates(df)
787-
788-
df = df_cleaned # Use the cleaned dataframe for insertion
789-
790-
rows_removed = len(df_removed)
791-
792-
if rows_removed > 0:
793-
# Define the path for the removed rows CSV file
794-
path_obj = Path(csv_file_path)
795-
removed_csv_path = path_obj.parent / f"{path_obj.stem}_removed_duplicates.csv"
796-
797-
# Write removed rows to CSV
798-
df_removed.to_csv(removed_csv_path, index=False)
799-
800-
print(
801-
f"WARNING: Removed {rows_removed} duplicate rows based on the UNIQUE key (subtype, feature_id, out_group)."
802-
)
803-
print(f"Removed rows written to: {removed_csv_path}")
804-
805-
# 2. Rename columns to match the SQL schema
806-
df = df.rename(columns=COLUMN_RENAME_MAP)
807-
df["feature_id"] = df["feature_id"].apply(extract_or_keep)
808-
809-
df_remove_null_subtypes = df.dropna(subset=["subtype_code"])
810-
df_remove_null_subtypes["dataset_given_id"] = df_remove_null_subtypes[
811-
"dataset_given_id"
812-
].apply(fix_dataset_given_ids)
813-
814-
try:
815-
# 4. Insert data into the context_analysis table using the SQLAlchemy engine
816-
# if_exists='append' is the default and safe for new rows, but will fail on duplicates.
817-
rows_inserted = df_remove_null_subtypes.to_sql(
818-
"context_analysis", db.engine, if_exists="append", index=False
819-
)
820-
821-
print(f"Successfully inserted {rows_inserted} rows.")
822-
return rows_inserted
823-
824-
except Exception as e:
825-
print(e)
826-
# FIX: Catch the specific IntegrityError (UNIQUE constraint violation)
827-
# and abort gracefully to PREVENT the loss of context_analysis_id sequence.
828-
print(
829-
f"ERROR: Insertion failed due to UNIQUE constraint violation. "
830-
f"The primary key sequence has been preserved."
831-
)
832-
print(
833-
"To proceed, you must manually run SQL to use INSERT OR IGNORE/REPLACE, or clean your input data."
834-
)
835-
return 0

portal-backend/depmap/context_explorer/box_plot_utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ def get_branch_subtype_codes_organized_by_code(sig_contexts: Dict[str, List[str]
277277
def get_sig_context_dataframe(
278278
tree_type: str,
279279
feature_type: str,
280-
feature_id: int,
280+
feature_id: str, # entrez_id for genes. compound_id for compounds.
281281
dataset_given_id: str,
282282
max_fdr: float = 0.1,
283283
min_abs_effect_size: float = 0.25,
@@ -529,7 +529,10 @@ def get_organized_contexts(
529529
def get_gene_enriched_lineages_entity_id_and_dataset_name(
530530
feature_id: str,
531531
) -> Optional[dict]:
532-
gene = Gene.get_gene_by_entrez(feature_id)
532+
gene = Gene.get_gene_by_entrez(int(feature_id))
533+
534+
assert gene is not None
535+
533536
dataset = get_dependency_dataset_for_entity(
534537
DependencyDataset.DependencyEnum.Chronos_Combined.name, gene.entity_id
535538
)

portal-backend/depmap/context_explorer/dose_curve_utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,9 @@ def get_context_dose_curves(
167167
key_name="auc_dataset_given_id",
168168
value_name=dataset_given_id,
169169
)
170+
171+
assert drc_dataset is not None
172+
170173
replicate_dataset_id = drc_dataset.replicate_dataset
171174

172175
replicate_dataset_name = replicate_dataset_id

portal-backend/depmap/context_explorer/models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ class ContextPlotBoxData:
7373

7474
@dataclass
7575
class NodeEntityData:
76-
feature_id: int
76+
feature_id: str
7777
label: str
7878
feature_full_row_of_values: pd.Series
7979
feature_overview_page_label: str
@@ -395,7 +395,7 @@ def find_context_analysis_by_subtype_code_out_group(
395395
@staticmethod
396396
def get_context_dependencies(
397397
tree_type: str,
398-
feature_id: int,
398+
feature_id: str,
399399
dataset_given_id: str,
400400
feature_type: str,
401401
max_fdr: float,

portal-backend/depmap/context_explorer/utils.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Literal, Optional
1+
from typing import Dict, List, Literal, Optional
22
import pandas as pd
33

44
from depmap import data_access
@@ -42,21 +42,20 @@ def get_path_to_node(selected_code: str) -> ContextPathInfo:
4242
return ContextPathInfo(path=path, tree_type=tree_type)
4343

4444

45-
# For genes, full label refers to gene_symbol (entrez_id)
46-
def get_feature_id_from_full_label(feature_type: str, feature_id: str) -> dict:
45+
def get_feature_id_from_full_label(
46+
feature_type: str, feature_id: str
47+
) -> Dict[str, str]:
4748
if feature_type == "gene":
48-
gene = Gene.get_gene_by_entrez(feature_id)
49+
gene = Gene.get_gene_by_entrez(int(feature_id))
4950

5051
assert gene is not None
5152
label = gene.label
5253
entity_overview_page_label = gene.label
53-
feature_id = gene.entrez_id
5454

5555
else:
5656
compound = Compound.get_by_compound_id(feature_id)
5757
label = compound.label
5858
entity_overview_page_label = compound.label
59-
feature_id = compound.compound_id # e.g. DPC-000001
6059

6160
return {
6261
"feature_id": feature_id,
@@ -67,7 +66,7 @@ def get_feature_id_from_full_label(feature_type: str, feature_id: str) -> dict:
6766

6867
def find_compound_dataset(
6968
datasets: List[DRCCompoundDataset], key_name: str, value_name: str
70-
) -> DRCCompoundDataset:
69+
) -> Optional[DRCCompoundDataset]:
7170
"""
7271
Searches a list of DRCCompoundDataset objects for the first object
7372
whose attribute (key_name) matches the specified value (value_name).

portal-backend/pyright-ratchet-errors.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -392,13 +392,9 @@ test_executive.py: error: Argument of type "GeneFactory" cannot be assigned to p
392392
test_executive.py: error: Argument of type "GeneFactory" cannot be assigned to parameter "gene" of type "Gene" in function "format_dep_dist_info"
393393
test_executive.py: error: Argument of type "list[DependencyDatasetFactory]" cannot be assigned to parameter "datasets" of type "List[DependencyDataset]" in function "format_predictability_tile"
394394
test_executive.py: error: Argument of type "list[Unknown] | None" cannot be assigned to parameter "__obj" of type "Sized" in function "len"
395-
test_executive.py: error: Cannot access member "dataset_id" for type "DependencyDatasetFactory"
396-
test_executive.py: error: Cannot access member "entity_id" for type "CompoundExperimentFactory"
397395
test_executive.py: error: Cannot access member "units" for type "LazyAttribute"
398396
test_executive.py: error: Object of type "None" cannot be used as iterable value (reportOptionalIterable)
399397
test_executive.py: error: Object of type "None" is not subscriptable (reportOptionalSubscript)
400-
test_executive.py: error: Operator "in" not supported for types "Literal['svg']" and "dict[str, dict[Unknown, Unknown]] | dict[str, Unknown] | None"
401-
test_executive.py: error: Operator "not in" not supported for types "Literal['svg']" and "dict[str, dict[Unknown, Unknown]] | dict[str, Unknown] | None"
402398
test_factories.py: error: "cell_line_1" is possibly unbound (reportPossiblyUnboundVariable)
403399
test_get_and_process_data.py: error: "file_path" is not a known member of "None" (reportOptionalMemberAccess)
404400
test_get_and_process_data.py: error: "row_index" is not a known member of "None" (reportOptionalMemberAccess)

portal-backend/tests/depmap/context_explorer/test_box_plot_utils.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def set_up_node_and_context_objects(
184184
]
185185

186186
feature_id = (
187-
GeneFactory(entrez_id="entrez_id").entrez_id
187+
str(GeneFactory(entrez_id="entrez_id").entrez_id)
188188
if feature_type == "gene"
189189
else CompoundFactory(compound_id="compound_id").compound_id
190190
)
@@ -316,6 +316,7 @@ def test_get_sig_context_dataframe_level_0_significant(
316316
make_level_0_significant=True,
317317
tree_type=tree_type,
318318
)
319+
feature_id = str(feature_id)
319320

320321
empty_db_mock_downloads.session.flush()
321322

@@ -359,6 +360,7 @@ def test_get_sig_context_dataframe_level_0_not_significant(
359360
make_level_0_significant=False,
360361
tree_type=tree_type,
361362
)
363+
feature_id = str(feature_id)
362364

363365
empty_db_mock_downloads.session.flush()
364366

@@ -441,6 +443,7 @@ def test_get_sig_context_data_frame_show_positive_effect_sizes(
441443
make_level_0_significant=True,
442444
tree_type=tree_type,
443445
)
446+
feature_id = str(feature_id)
444447
empty_db_mock_downloads.session.flush()
445448

446449
max_fdr, min_abs_effect_size, frac_dep_in = get_context_explorer_box_plot_filters(
@@ -537,6 +540,7 @@ def test_get_context_plot_data(
537540
make_level_0_significant=True,
538541
tree_type=tree_type,
539542
)
543+
feature_id = str(feature_id)
540544
empty_db_mock_downloads.session.flush()
541545
interactive_test_utils.reload_interactive_config()
542546

@@ -675,7 +679,7 @@ def test_get_data_to_show_if_no_contexts_significant(
675679
make_level_0_significant=False,
676680
tree_type=tree_type,
677681
)
678-
682+
feature_id = str(feature_id)
679683
empty_db_mock_downloads.session.flush()
680684
interactive_test_utils.reload_interactive_config()
681685

portal-backend/tests/depmap/context_explorer/test_context_analysis.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -127,16 +127,14 @@ def _setup_factories(
127127
empty_db_mock_downloads,
128128
dataset_given_id: str,
129129
monkeypatch,
130-
gene_a: Optional[GeneFactory] = None,
131-
gene_b: Optional[GeneFactory] = None,
132-
compound_a: Optional[CompoundFactory] = None,
133-
compound_b: Optional[CompoundFactory] = None,
134-
compound_exp_a: Optional[CompoundExperimentFactory] = None,
135-
compound_exp_b: Optional[CompoundExperimentFactory] = None,
130+
gene_a: GeneFactory,
131+
gene_b: GeneFactory,
132+
compound_a: CompoundFactory,
133+
compound_b: CompoundFactory,
134+
compound_exp_a: CompoundExperimentFactory,
135+
compound_exp_b: CompoundExperimentFactory,
136136
feature_type: Literal["gene", "compound"] = "gene",
137-
) -> List[DepmapModelFactory]:
138-
assert gene_a and gene_b if feature_type == "gene" else compound_a and compound_b
139-
137+
):
140138
use_genes = feature_type == "gene"
141139

142140
bone_es_cell_lines = [
@@ -1074,7 +1072,7 @@ def test_get_drug_dotted_line(empty_db_mock_downloads, dataset_given_id, monkeyp
10741072

10751073
interactive_test_utils.reload_interactive_config()
10761074

1077-
feature_id = gene_a.entrez_id if use_genes else compound_a.compound_id
1075+
feature_id = str(gene_a.entrez_id) if use_genes else compound_a.compound_id
10781076

10791077
(entity_full_row_of_values) = get_full_row_of_values_and_depmap_ids(
10801078
dataset_given_id=dataset_given_id, feature_id=feature_id
@@ -1146,7 +1144,7 @@ def test_get_box_plot_data(empty_db_mock_downloads, dataset_given_id, monkeypatc
11461144

11471145
interactive_test_utils.reload_interactive_config()
11481146

1149-
feature_id = gene_a.entrez_id if use_genes else compound_a.compound_id
1147+
feature_id = str(gene_a.entrez_id) if use_genes else compound_a.compound_id
11501148

11511149
### Test - the User is on the Lineage tab and selects "BONE". Then, the
11521150
### user selects a specific gene/compound from either the scatter plots or

0 commit comments

Comments (0)