Skip to content

Commit b53d405

Browse files
Add error handling to analysis part.
1 parent 5840cf4 commit b53d405

File tree

4 files changed

+200
-24
lines changed

4 files changed

+200
-24
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ extend-ignore = [
9696
"ARG001", # Unused function MB
9797
"ERA001", # commented MB
9898
"TRY003", # Messages outside exception MB
99+
"D415", # First line should end with a period, question mark, or exclamation MB
99100
]
100101

101102
[tool.ruff.lint.per-file-ignores]

src/project_mbb/analysis/desc_analysis.py

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import pandas as pd
22

3+
from project_mbb.parameters import commune_mapping
4+
35
pd.options.mode.copy_on_write = True
46
pd.options.future.infer_string = True
57
pd.options.plotting.backend = "plotly"
@@ -12,6 +14,8 @@ def calculate_perception_general(enusc_clean):
1214
"crime_increase_perception_neighborhood",
1315
]
1416

17+
_fail_if_missing_columns(enusc_clean, perception_columns)
18+
1519
perception_results = {}
1620

1721
for col in perception_columns:
@@ -26,6 +30,8 @@ def calculate_perception_general(enusc_clean):
2630

2731

2832
def calculate_perception_by_commune(enusc_clean):
33+
_fail_if_no_total_communes(enusc_clean, commune_mapping)
34+
2935
perception_columns = [
3036
"crime_increase_perception_nation",
3137
"crime_increase_perception_commune",
@@ -64,6 +70,8 @@ def calculate_perception_by_commune(enusc_clean):
6470

6571

6672
def calculate_perception_by_ses(enusc_clean):
73+
_fail_if_ses_not_categorical(enusc_clean)
74+
6775
perception_columns = [
6876
"crime_increase_perception_nation",
6977
"crime_increase_perception_commune",
@@ -90,13 +98,44 @@ def calculate_perception_by_ses(enusc_clean):
9098
perception_results_df = pd.concat(perception_results, axis=0)
9199

92100
perception_results_df = perception_results_df.pivot_table(
93-
index=["socioecon_status", "perception"],
94-
columns="response",
101+
index=["socioecon_status", "perception"],
102+
columns="response",
95103
values="percentage",
96-
observed=False
104+
observed=False,
97105
)
98106

99107
perception_results_df = perception_results_df.reset_index()
100108

101109
return perception_results_df
102110

111+
112+
# Error Handling
113+
114+
115+
def _fail_if_missing_columns(enusc_clean, required_columns):
    """Raise a KeyError when 'enusc_clean' lacks any of the required columns.

    Args:
        enusc_clean: Object exposing a ``columns`` attribute that supports
            membership tests (e.g. a DataFrame).
        required_columns: Iterable of column names that must be present.

    Raises:
        KeyError: Naming every absent column, comma separated, in the order
            given by 'required_columns'.
    """
    absent = [name for name in required_columns if name not in enusc_clean.columns]
    if not absent:
        return

    error_msg = "Missing expected columns: " + ", ".join(absent)
    raise KeyError(error_msg)
121+
122+
123+
def _fail_if_no_total_communes(enusc_clean, commune_mapping):
    """Raise a ValueError if a commune of 'commune_mapping' is absent from the data.

    The check is a set difference: every value of 'commune_mapping' must occur
    at least once in the 'commune' column. Communes that appear in the data but
    not in the mapping are not reported.

    Args:
        enusc_clean: DataFrame with a 'commune' column.
        commune_mapping: Mapping whose values are the expected commune labels.

    Raises:
        ValueError: Listing the expected communes that the data does not contain.
    """
    observed = set(enusc_clean["commune"].unique())
    expected = set(commune_mapping.values())

    absent = expected - observed
    if not absent:
        return

    error_msg = f"Missing communes: {absent}"
    raise ValueError(error_msg)
132+
133+
134+
def _fail_if_ses_not_categorical(enusc_clean, ses_column="socioecon_status"):
    """Raise a TypeError if the socioeconomic status column is not categorical.

    Args:
        enusc_clean: DataFrame containing 'ses_column'.
        ses_column: Name of the column to check.

    Raises:
        TypeError: If the dtype of 'ses_column' is not a pandas categorical dtype.
    """
    ses_dtype = enusc_clean[ses_column].dtype
    # `pd.api.types.is_categorical_dtype` is deprecated in recent pandas; the
    # isinstance check against `pd.CategoricalDtype` is its replacement.
    if not isinstance(ses_dtype, pd.CategoricalDtype):
        error_msg = (
            f"Column '{ses_column}' must be of type 'category', "
            f"but got {ses_dtype}."
        )
        raise TypeError(error_msg)

src/project_mbb/analysis/model.py

Lines changed: 141 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import statsmodels.api as sm
33
import statsmodels.formula.api as smf
44

5+
from project_mbb.parameters import info_sources_mapping, perception_change_mapping
6+
57

68
def regression_perception_info(enusc_clean):
79
enusc_model_pre = _set_category_values(enusc_clean)
@@ -12,17 +14,23 @@ def regression_perception_info(enusc_clean):
1214

1315

1416
def _set_category_values(enusc_clean):
15-
enusc_model = pd.DataFrame()
16-
enusc_model["crime_increase_perception_commune"] = enusc_clean[
17+
_fail_if_invalid_categories_perception(enusc_clean, perception_change_mapping)
18+
_fail_if_invalid_categories_source(enusc_clean, info_sources_mapping)
19+
20+
enusc_model_pre = pd.DataFrame()
21+
enusc_model_pre["crime_increase_perception_commune"] = enusc_clean[
1722
"crime_increase_perception_commune"
1823
].cat.codes
19-
enusc_model["crime_info_source_commune"] = enusc_clean[
24+
enusc_model_pre["crime_info_source_commune"] = enusc_clean[
2025
"crime_info_source_commune"
2126
].cat.codes
22-
return enusc_model
27+
return enusc_model_pre
2328

2429

2530
def _set_binary_for_info_source(enusc_model):
31+
_fail_if_invalid_category_values(enusc_model, "crime_increase_perception_commune")
32+
_fail_if_invalid_category_values(enusc_model, "crime_info_source_commune")
33+
2634
tech_based_codes = {2, 5, 6}
2735

2836
enusc_model["tech_based"] = enusc_model["crime_info_source_commune"].apply(
@@ -32,6 +40,8 @@ def _set_binary_for_info_source(enusc_model):
3240

3341

3442
def _drop_missing(enusc_model):
43+
_fail_if_invalid_tech_based_values(enusc_model)
44+
3545
enusc_model_clean = enusc_model[
3646
(enusc_model["crime_increase_perception_commune"] != -1)
3747
& (enusc_model["crime_info_source_commune"] != -1)
@@ -40,9 +50,136 @@ def _drop_missing(enusc_model):
4050

4151

4252
def _run_logistic_regression(enusc_model_clean):
    """Fit a multinomial logit of commune crime perception on 'tech_based'.

    Args:
        enusc_model_clean: DataFrame with the columns
            'crime_increase_perception_commune', 'crime_info_source_commune'
            and 'tech_based', free of missing codes.

    Returns:
        The fitted result of ``smf.mnlogit(...).fit()``.

    Raises:
        ValueError: From `_fail_if_missing_values_after_drop` when rows with
            missing values remain.
    """
    _fail_if_missing_values_after_drop(enusc_model_clean)

    # The formula call receives the DataFrame directly, so the previously built
    # (and never used) design matrix with an explicit constant is not needed.
    return smf.mnlogit(
        "crime_increase_perception_commune ~ tech_based", data=enusc_model_clean
    ).fit()
61+
62+
63+
# Error Handling
64+
65+
66+
def _fail_if_invalid_categories_perception(enusc_clean, perception_change_mapping):
    """Raise a ValueError if the perception categories differ from the mapping.

    The categories of 'crime_increase_perception_commune' are compared with the
    values of 'perception_change_mapping'. The value 99 and missing values are
    removed from both sides before the comparison.

    Args:
        enusc_clean: DataFrame whose 'crime_increase_perception_commune' column
            is categorical.
        perception_change_mapping: Mapping whose values are the expected
            categories.

    Raises:
        ValueError: If an expected category is absent from the column, or the
            column has a category that is not expected.
    """
    ignored_cat = 99

    def _relevant(categories):
        # Test for missing values first: comparing pd.NA with `!=` yields NA,
        # which cannot be used in a boolean context.
        return {
            cat for cat in categories if not pd.isna(cat) and cat != ignored_cat
        }

    perception_categories = _relevant(
        enusc_clean["crime_increase_perception_commune"].cat.categories
    )
    expected_categories = _relevant(perception_change_mapping.values())

    missing_categories = expected_categories - perception_categories
    extra_categories = perception_categories - expected_categories

    if missing_categories or extra_categories:
        error_msg = (
            "Invalid categories for 'crime_increase_perception_commune':\n"
            f"Missing categories: {missing_categories}\n"
            f"Extra categories: {extra_categories}"
        )
        raise ValueError(error_msg)
98+
99+
100+
def _fail_if_invalid_categories_source(enusc_clean, info_source_mapping):
    """Raise a ValueError if the info-source categories differ from the mapping.

    The categories of 'crime_info_source_commune' are compared with the values
    of 'info_source_mapping'. The value 99 and missing values are removed from
    both sides before the comparison.

    Args:
        enusc_clean: DataFrame whose 'crime_info_source_commune' column is
            categorical.
        info_source_mapping: Mapping whose values are the expected categories.

    Raises:
        ValueError: If an expected category is absent from the column, or the
            column has a category that is not expected.
    """
    ignored_cat = 99

    def _relevant(categories):
        # Test for missing values first: comparing pd.NA with `!=` yields NA,
        # which cannot be used in a boolean context.
        return {
            cat for cat in categories if not pd.isna(cat) and cat != ignored_cat
        }

    source_categories = _relevant(
        enusc_clean["crime_info_source_commune"].cat.categories
    )
    # Use the mapping handed in by the caller, not a module-level global.
    expected_categories = _relevant(info_source_mapping.values())

    missing_categories = expected_categories - source_categories
    extra_categories = source_categories - expected_categories

    if missing_categories or extra_categories:
        error_msg = (
            "Invalid categories for 'crime_info_source_commune':\n"
            f"Missing categories: {missing_categories}\n"
            f"Extra categories: {extra_categories}"
        )
        raise ValueError(error_msg)
129+
130+
131+
def _fail_if_invalid_category_values(enusc_model, column_name):
    """Raise a ValueError if a column holds codes outside its valid range.

    Args:
        enusc_model: DataFrame containing 'column_name' with integer codes.
        column_name: Either 'crime_increase_perception_commune' or
            'crime_info_source_commune'.

    Raises:
        ValueError: If 'column_name' has no defined valid range, or if the
            column contains a value outside that range.
    """
    # -1 is accepted next to the regular codes (pandas uses -1 in `.cat.codes`
    # for missing values).
    valid_ranges = {
        "crime_increase_perception_commune": set(range(5)) | {-1},  # 0 to 4
        "crime_info_source_commune": set(range(11)) | {-1},  # 0 to 10
    }
    if column_name not in valid_ranges:
        error_msg = f"No valid range defined for column '{column_name}'."
        raise ValueError(error_msg)
    valid_range = valid_ranges[column_name]

    column_values = enusc_model[column_name]
    invalid_values = column_values[~column_values.isin(valid_range)].unique()

    if len(invalid_values) > 0:
        error_msg = (
            f"Invalid category values in '{column_name}': {invalid_values}. "
            f"Expected values within {valid_range}."
        )
        raise ValueError(error_msg)
150+
151+
152+
def _fail_if_invalid_tech_based_values(enusc_model):
    """Raise a ValueError if 'tech_based' holds values other than -1, 0 or 1.

    Args:
        enusc_model: DataFrame containing a 'tech_based' column.

    Raises:
        ValueError: Listing the unique offending values.
    """
    valid_values = {-1, 0, 1}
    tech_based = enusc_model["tech_based"]
    invalid_values = tech_based[~tech_based.isin(valid_values)].unique()

    if len(invalid_values) > 0:
        # The message names all accepted values, including -1.
        error_msg = (
            f"Invalid values in 'tech_based' column: {invalid_values}. "
            "Expected values are -1, 0 and 1."
        )
        raise ValueError(error_msg)
165+
166+
167+
def _fail_if_missing_values_after_drop(enusc_model_clean):
    """Raise a ValueError if missing values remain in the model columns.

    A row counts as missing when 'crime_increase_perception_commune' or
    'crime_info_source_commune' is -1 or NaN.

    Args:
        enusc_model_clean: DataFrame containing both columns.

    Raises:
        ValueError: Reporting how many rows still contain missing values.
    """
    perception = enusc_model_clean["crime_increase_perception_commune"]
    source = enusc_model_clean["crime_info_source_commune"]

    is_missing = (
        (perception == -1) | (source == -1) | perception.isna() | source.isna()
    )
    n_missing = int(is_missing.sum())

    if n_missing > 0:
        error_msg = (
            f"Missing values after dropping: {n_missing} "
            "rows contain missing values in 'crime_increase_perception_commune'"
            " or 'crime_info_source_commune'."
        )
        raise ValueError(error_msg)

work_maren/tests_maren.ipynb

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -127392,37 +127392,36 @@
127392127392
},
127393127393
{
127394127394
"cell_type": "code",
127395-
"execution_count": 168,
127395+
"execution_count": 179,
127396127396
"metadata": {},
127397127397
"outputs": [
127398127398
{
127399127399
"data": {
127400127400
"text/plain": [
127401-
"<bound method Series.unique of 0 2\n",
127402-
"1 -1\n",
127403-
"2 -1\n",
127404-
"3 -1\n",
127405-
"4 -1\n",
127406-
" ..\n",
127407-
"146289 -1\n",
127408-
"146290 2\n",
127409-
"146291 2\n",
127410-
"146292 -1\n",
127411-
"146293 -1\n",
127412-
"Length: 146294, dtype: int8>"
127401+
"Index(['Don't know',\n",
127402+
" 'Information from other non-family people, friends, neighbors, or acquaintances',\n",
127403+
" 'Information gathered from social media, Facebook, Twitter, TikTok, and similar',\n",
127404+
" 'Information provided by family members',\n",
127405+
" 'National newspapers in print and/or electronic form',\n",
127406+
" 'News programs on television', 'Other',\n",
127407+
" 'Other tv programs not news: morning shows, documentaries, or special programs',\n",
127408+
" 'Personal experience',\n",
127409+
" 'Regional or local newspapers in print and/or electronic form',\n",
127410+
" 'Through the radio'],\n",
127411+
" dtype='string')"
127413127412
]
127414127413
},
127415-
"execution_count": 168,
127414+
"execution_count": 179,
127416127415
"metadata": {},
127417127416
"output_type": "execute_result"
127418127417
}
127419127418
],
127420127419
"source": [
127421127420
"# Show the categories and their corresponding codes\n",
127422-
"categories = enusc_clean[\"crime_increase_perception_nation\"].cat.categories\n",
127423-
"codes = enusc_clean[\"crime_increase_perception_nation\"].cat.codes\n",
127421+
"categories = enusc_clean[\"crime_info_source_commune\"].cat.categories\n",
127422+
"codes = enusc_clean[\"crime_info_source_commune\"].cat.codes\n",
127424127423
"categories\n",
127425-
"codes\n",
127424+
"\n",
127426127425
"\n",
127427127426
"\n",
127428127427
"#0 corresponds to 'Decreased'\n",

0 commit comments

Comments
 (0)