Skip to content

Commit 5840cf4

Browse files
Add error handling to data mgt part.
1 parent e97ba00 commit 5840cf4

File tree

13 files changed

+20940
-69
lines changed

13 files changed

+20940
-69
lines changed

.DS_Store

0 Bytes
Binary file not shown.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ extend-ignore = [
9595
"S301", # pickle module is unsafe
9696
"ARG001", # Unused function MB
9797
"ERA001", # commented MB
98+
"TRY003", # Messages outside exception MB
9899
]
99100

100101
[tool.ruff.lint.per-file-ignores]

src/.DS_Store

0 Bytes
Binary file not shown.

src/project_mbb/.DS_Store

0 Bytes
Binary file not shown.

src/project_mbb/data/.DS_Store

0 Bytes
Binary file not shown.

src/project_mbb/data_management/clean_enusc.py

Lines changed: 71 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,19 @@ def clean_enusc(raw_enusc):
1919
enusc_filtered = _filter_enusc(raw_enusc, relevant_var)
2020
enusc_renamed = _rename_enusc(enusc_filtered, rename_mapping)
2121
enusc_mapped = _map_categories(enusc_renamed)
22-
# here check for no objects present
2322
enusc_filled = _fill_missing(enusc_mapped)
2423
enusc_dtypes = _set_data_types_not_mapped_var(enusc_filled)
2524
return enusc_dtypes
2625

2726

2827
def _filter_enusc(raw_enusc, relevant_var):
    """Return the part of ``raw_enusc`` selected by ``relevant_var``.

    ``relevant_var`` is first validated with ``_fail_if_not_list`` and then
    used as the key to index ``raw_enusc``.
    """
    _fail_if_not_list(relevant_var)
    return raw_enusc[relevant_var]
3131

3232

3333
def _rename_enusc(enusc_filtered, rename_mapping):
34+
_fail_if_not_equal_length(enusc_filtered, rename_mapping)
3435
enusc_renamed = enusc_filtered.copy()
3536
enusc_renamed.columns = enusc_renamed.columns.str.lower()
3637
enusc_renamed = enusc_renamed.rename(columns=rename_mapping)
@@ -52,7 +53,12 @@ def _map_categories(enusc_renamed):
5253

5354

5455
def _fill_missing(enusc_mapped):
56+
_fail_if_not_dataframe(enusc_mapped)
57+
_fail_if_missing_columns(enusc_mapped, categories, "categories")
58+
_fail_if_missing_columns(enusc_mapped, map_category, "map_category")
59+
5560
replacements = {99: pd.NA, 77: "Other", 88: "Don't know", 85: "Doesn't apply"}
61+
5662
enusc_filling = enusc_mapped.copy()
5763
for column in categories:
5864
enusc_filling[column] = enusc_filling[column].astype(pd.Int8Dtype())
@@ -61,11 +67,7 @@ def _fill_missing(enusc_mapped):
6167
)
6268

6369
if enusc_mapped[column].dtype != "object":
64-
enusc_filling[column] = (
65-
enusc_filling[column]
66-
# .astype(pd.Int8Dtype()) esto se puede borrar
67-
.astype(pd.CategoricalDtype())
68-
)
70+
enusc_filling[column] = enusc_filling[column].astype(pd.CategoricalDtype())
6971
enusc_filling[column] = enusc_filling[column].cat.rename_categories(
7072
lambda x: replacements.get(x, x)
7173
)
@@ -86,6 +88,11 @@ def _fill_missing(enusc_mapped):
8688

8789

8890
def _set_data_types_not_mapped_var(enusc_filled):
91+
_fail_if_columns_not_found(enusc_filled, floats)
92+
_fail_if_columns_not_found(enusc_filled, integers)
93+
_fail_if_columns_not_found(enusc_filled, categories)
94+
_fail_if_columns_not_found(enusc_filled, strings)
95+
8996
enusc_dtypes = enusc_filled.copy()
9097
for value in floats:
9198
enusc_dtypes[value] = enusc_dtypes[value].astype(pd.Float64Dtype())
@@ -96,3 +103,61 @@ def _set_data_types_not_mapped_var(enusc_filled):
96103
for ent in strings:
97104
enusc_dtypes[ent] = enusc_dtypes[ent].astype(str)
98105
return enusc_dtypes
106+
107+
108+
# Error Handling
109+
110+
111+
def _fail_if_not_list(relevant_var):
112+
"""Raise TypeError if relevant_var for filtering is not a list."""
113+
if not isinstance(relevant_var, list):
114+
error_msg = f"Expected a list, but got {type(relevant_var).__name__}"
115+
raise TypeError(error_msg)
116+
117+
118+
class ShapeError(Exception):
119+
"""Custom exception for errors in _rename_enusc."""
120+
121+
122+
def _fail_if_not_equal_length(enusc_filtered, rename_mapping):
123+
"""Raise ShapeError if data and renaming dictionary have unequal width.
124+
125+
The number of columns in `enusc_filtered` must match the length of
126+
`rename_mapping`.
127+
"""
128+
if enusc_filtered.shape[1] != len(rename_mapping):
129+
error_msg = (
130+
f"Lists have unequal widths: "
131+
f"{enusc_filtered.shape[1]} vs {len(rename_mapping)}"
132+
)
133+
raise ShapeError(error_msg)
134+
135+
136+
class MissingError(Exception):
137+
"""Custom exception for missing."""
138+
139+
140+
def _fail_if_not_dataframe(enusc_mapped):
141+
"""Raise an error if enusc_mapped is not a Pandas DataFrame."""
142+
if not isinstance(enusc_mapped, pd.DataFrame):
143+
error_msg = f"Expected a DataFrame, but got {type(enusc_mapped).__name__}"
144+
raise TypeError(error_msg)
145+
146+
147+
def _fail_if_missing_columns(enusc_mapped, column_list, list_name):
148+
"""Raise an error if any column in column_list is missing from enusc_mapped."""
149+
missing_columns = [col for col in column_list if col not in enusc_mapped.columns]
150+
if missing_columns:
151+
error_msg = (
152+
f"The following columns from '{list_name}'"
153+
f"are missing in the DataFrame: {missing_columns}"
154+
)
155+
raise MissingError(error_msg)
156+
157+
158+
def _fail_if_columns_not_found(enusc_filled, column_list):
159+
"""Raise an error if a column in the list is not found in the DataFrame."""
160+
missing_columns = [col for col in column_list if col not in enusc_filled.columns]
161+
if missing_columns:
162+
error_msg = f"Columns not found in DataFrame: {', '.join(missing_columns)}"
163+
raise MissingError(error_msg)

src/project_mbb/data_management/clean_labels.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ def create_labels_var_relation(raw_enusc, column_labels):
1515

1616

1717
def _concat_labels_var(raw_enusc, column_labels):
18+
_fail_if_mismatched_length(raw_enusc, column_labels)
19+
1820
column_names = raw_enusc.columns.tolist()
1921
column_names = pd.DataFrame(column_names)
2022
column_names = column_names.iloc[1:]
@@ -31,5 +33,55 @@ def _filter_labels_var(var_labels, relevant_var):
3133

3234
def _rename_variables_eng(labels_var_filt, rename_mapping):
    """Lower-case ``variable_name`` and map it through ``rename_mapping``.

    The lower-cased values are written back into ``labels_var_filt`` itself,
    so the object passed in by the caller is modified. The returned object is
    the result of ``replace`` on the ``variable_name`` column.
    """
    lowered_names = labels_var_filt["variable_name"].str.lower()
    labels_var_filt["variable_name"] = lowered_names

    # NOTE(review): ``relevant_var`` is not a parameter of this function;
    # presumably it is a module-level name imported elsewhere in this file.
    # Confirm it is in scope.
    _fail_if_mismatched_values(labels_var_filt, relevant_var)

    return labels_var_filt.replace({"variable_name": rename_mapping})
41+
42+
43+
# Error Handling
44+
45+
46+
class ShapeError(Exception):
47+
"""Custom exception for errors in _rename_enusc."""
48+
49+
50+
def _fail_if_mismatched_length(raw_enusc, column_labels):
51+
"""Ensure 'raw_enusc' columns match 'column_labels' length.
52+
53+
Raises an error if not fulfilled.
54+
"""
55+
expected_length = len(column_labels)
56+
57+
if len(raw_enusc.columns) - 1 != expected_length:
58+
error_msg = (
59+
f"Column length mismatch: {len(raw_enusc.columns)} columns in 'raw_enusc', "
60+
)
61+
f"but {expected_length} labels in 'column_labels'."
62+
raise ShapeError(error_msg)
63+
64+
65+
class MissingError(Exception):
66+
"""Custom exception for missing."""
67+
68+
69+
def _fail_if_mismatched_values(labels_var_filt, relevant_var):
70+
"""Raise an error if 'variable_name' and 'relevant_var' are not equal.
71+
72+
Also isplays the mismatches.
73+
"""
74+
var_labels_values = set(labels_var_filt["variable_name"].values)
75+
relevant_var_lower = {x.lower() for x in relevant_var}
76+
77+
missing_in_var_labels = relevant_var_lower - var_labels_values
78+
missing_in_relevant_var = var_labels_values - relevant_var_lower
79+
80+
mismatched_vars = list(missing_in_var_labels | missing_in_relevant_var)
81+
82+
if mismatched_vars:
83+
error_msg = (
84+
"The 'variable_name' values do not match 'relevant_var' values.\n",
85+
)
86+
f"Missing or mismatched variables: {', '.join(mismatched_vars)}"
87+
raise MissingError(error_msg)

src/project_mbb/final/plot.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,5 +44,4 @@ def plot_crime_perception(enusc_clean):
4444
template="plotly_white",
4545
)
4646

47-
# Show figure
4847
fig.write_image(i)

src/project_mbb/parameters.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,14 @@
8383
"rph_nivel": "education_level",
8484
"rph_ciuo": "occupation_classification",
8585
"rph_nse": "socioecon_status",
86+
"fact_pers_com": "personal_facts_commununal",
87+
"fact_pers_reg": "personal_facts_regional",
88+
"fact_pers_regional_102": "personal_facts_regional_102",
89+
"fact_hog_com": "household_facts_commununal",
90+
"fact_hog_reg": "household_facts_regional",
91+
"fact_hog_regional_102": "household_facts_regional_102",
92+
"varstrat": "stratum_variables",
93+
"conglomerado": "conglomerate",
8694
# perception of crime increase
8795
"p_aumento_pais": "crime_increase_perception_nation",
8896
"p_aumento_com": "crime_increase_perception_commune",
@@ -467,15 +475,15 @@
467475

468476

469477
floats = {
470-
"fact_pers_com",
471-
"fact_pers_reg",
472-
"fact_pers_regional_102",
473-
"fact_hog_com",
474-
"fact_hog_reg",
475-
"fact_hog_regional_102",
476-
"varstrat",
477-
"conglomerado",
478478
"housing_folio",
479+
"personal_facts_commununal",
480+
"personal_facts_regional",
481+
"personal_facts_regional_102",
482+
"household_facts_commununal",
483+
"household_facts_regional",
484+
"household_facts_regional_102",
485+
"stratum_variables",
486+
"conglomerate",
479487
}
480488

481489
integers = {

tests/data_management/test_data_mgt_enusc.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,15 @@
2424
@pytest.fixture
2525
def enusc_raw():
2626
enusc_raw_path = BLD / "data" / "ENUSC_raw.csv"
27-
enusc_raw = pd.read_csv(enusc_raw_path)
27+
enusc_raw = pd.read_csv(enusc_raw_path, engine="pyarrow")
2828
return enusc_raw
2929

3030

3131
@pytest.fixture
3232
def enusc_raw_short():
3333
enusc_raw_path = BLD / "data" / "ENUSC_raw.csv"
34-
enusc_raw_short = pd.read_csv(enusc_raw_path, nrows=3)
34+
enusc_raw = pd.read_csv(enusc_raw_path, engine="pyarrow")
35+
enusc_raw_short = enusc_raw.head(3)
3536
return enusc_raw_short
3637

3738

0 commit comments

Comments
 (0)