Skip to content

Commit 90a3284

Browse files
Complete tests for data mgt enusc.
1 parent a713942 commit 90a3284

File tree

1 file changed

+177
-16
lines changed

1 file changed

+177
-16
lines changed

tests/data_management/test_data_mgt_enusc.py

Lines changed: 177 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,22 @@
33

44
from project_mbb.config import BLD
55
from project_mbb.data_management.clean_enusc import (
6+
_fill_missing,
67
_filter_enusc,
8+
_map_categories,
79
_rename_enusc,
10+
_set_data_types,
11+
)
12+
from project_mbb.parameters import (
13+
categories,
14+
floats,
15+
integers,
16+
relevant_var,
17+
rename_mapping,
18+
security_levels_mapping,
19+
ses_mapping,
20+
strings,
821
)
9-
from project_mbb.parameters import relevant_var, rename_mapping
1022

1123

1224
@pytest.fixture
@@ -17,21 +29,13 @@ def enusc_raw():
1729

1830

1931
@pytest.fixture
20-
def data_test():
21-
data = {
22-
"variable_name": ["rph_id", "idhogar", "enc_idr", "enc_region", "enc_rpc"],
23-
"label": [
24-
"Identificador de persona",
25-
"Identificador de hogar",
26-
"Folio de la vivienda",
27-
"Región",
28-
"Región, provincia, comuna",
29-
],
30-
}
31-
return pd.DataFrame(data)
32-
33-
34-
def test_check_if_filter_correct_columns(enusc_raw):
32+
def enusc_raw_short():
    """Return a three-row sample of the raw ENUSC CSV.

    Reads ``ENUSC_raw.csv`` from the ``data`` folder under ``BLD`` with
    ``nrows=3`` so that tests built on it stay fast.
    """
    return pd.read_csv(BLD / "data" / "ENUSC_raw.csv", nrows=3)
36+
37+
38+
def test_filter_correct_columns(enusc_raw):
3539
enusc_filtered = _filter_enusc(enusc_raw, relevant_var)
3640

3741
actual_columns = set(enusc_filtered.columns)
@@ -58,3 +62,160 @@ def test_rename_enusc(enusc_raw):
5862
assert expected_columns.issubset(
5963
actual_columns
6064
), f"Missing columns: {missing_columns}, Extra: {extra_columns}"
65+
66+
67+
@pytest.mark.parametrize(
    ("column", "expected_value"),
    [
        ("socioecon_status", "Low SES"),
        ("insec_driving", "Secure"),
        ("trust_pdi", "Quite a bit of trust"),
        ("crime_info_source_nation", "News programs on television"),
    ],
)
def test_value_presence(enusc_raw_short, column, expected_value):
    """Check that ``expected_value`` occurs in ``column`` after category mapping.

    The short raw sample is filtered, renamed and passed through
    ``_map_categories`` before the membership check.
    """
    mapped = _map_categories(
        _rename_enusc(_filter_enusc(enusc_raw_short, relevant_var), rename_mapping)
    )
    observed = mapped[column].values

    assert expected_value in observed, f" {expected_value} not found in {column}"
84+
85+
86+
@pytest.mark.parametrize(
    "column",
    ["socioecon_status", "insec_driving", "trust_pdi", "crime_info_source_nation"],
)
def test_column_is_categorical(enusc_raw_short, column):
    """Check that ``column`` has a categorical dtype after ``_map_categories``."""
    mapped = _map_categories(
        _rename_enusc(_filter_enusc(enusc_raw_short, relevant_var), rename_mapping)
    )
    column_dtype = mapped[column].dtype

    assert isinstance(column_dtype, pd.CategoricalDtype), f"{column} is not categorical"
98+
99+
100+
@pytest.mark.parametrize(
    ("column", "values_dict"),
    [
        ("socioecon_status", ses_mapping),
        ("insec_driving", security_levels_mapping),
    ],
)
def test_value_presence_continued(enusc_raw_short, column, values_dict):
    """Check that every non-missing entry of ``column`` is a value of ``values_dict``.

    All offending entries are collected first and reported together in a
    single ``pytest.fail`` call.
    """
    mapped = _map_categories(
        _rename_enusc(_filter_enusc(enusc_raw_short, relevant_var), rename_mapping)
    )
    allowed = values_dict.values()

    # Missing entries are skipped; only real values are checked.
    unexpected = [
        f"Unexpected value {entry} in column '{column}'"
        for entry in mapped[column]
        if not pd.isna(entry) and entry not in allowed
    ]

    if unexpected:
        pytest.fail("\n".join(unexpected))
121+
122+
123+
@pytest.mark.xfail(
    raises=AssertionError,
    strict=True,
    reason="'age' is not expected to be categorical after _map_categories",
)
def test_fail_age_not_categorical(enusc_raw_short):
    """Expected failure: ``age`` must not be categorical after ``_map_categories``.

    ``raises=AssertionError`` restricts the expected failure to the assertion
    below, so an unrelated error (unreadable CSV, missing ``age`` column, a
    crash inside the cleaning helpers) is reported as a real failure instead
    of being counted as xfail. ``strict=True`` turns an unexpected pass into
    a failure as well.
    """
    enusc_filtered = _filter_enusc(enusc_raw_short, relevant_var)
    enusc_renamed = _rename_enusc(enusc_filtered, rename_mapping)
    enusc_category = _map_categories(enusc_renamed)

    assert isinstance(enusc_category["age"].dtype, pd.CategoricalDtype)
129+
130+
131+
def test_amount_of_missing(enusc_raw_short):
    """Check the per-column missing count after ``_fill_missing``.

    For every raw column that has an entry in ``rename_mapping`` and whose
    renamed counterpart is present in the filled frame, the number of NaN
    entries after ``_fill_missing`` must equal the number of raw NaN entries
    plus the number of raw entries equal to the missing code ``99``.
    All mismatches are collected and reported together.
    """
    enusc_filtered = _filter_enusc(enusc_raw_short, relevant_var)
    enusc_renamed = _rename_enusc(enusc_filtered, rename_mapping)
    enusc_category = _map_categories(enusc_renamed)
    enusc_filled = _fill_missing(enusc_category)

    errors = []
    missing_value = 99

    for column in enusc_raw_short.columns:
        # Only columns that were renamed and survived filtering are compared.
        if column not in rename_mapping:
            continue
        filled_column = rename_mapping[column]
        if filled_column not in enusc_filled.columns:
            continue

        raw_column = enusc_raw_short[column]
        expected_missing = (
            raw_column.isna().sum() + (raw_column == missing_value).sum()
        )
        actual_missing = enusc_filled[filled_column].isna().sum()

        if actual_missing != expected_missing:
            errors.append(
                f"Column '{column}' (mapped to '{filled_column}') failed: "
                f"expected missing value count of {expected_missing}, "
                f"but got {actual_missing}"
            )

    if errors:
        pytest.fail("\n".join(errors))
157+
158+
159+
@pytest.mark.parametrize(
    ("column", "expected_dtype"),
    [(value, pd.Float64Dtype()) for value in floats]
    + [(var, pd.Int8Dtype()) for var in integers]
    + [(val, pd.CategoricalDtype()) for val in categories]
    + [(ent, "object") for ent in strings],
)
def test_set_data_types(enusc_raw_short, column, expected_dtype):
    """Check that ``_set_data_types`` gives ``column`` the expected dtype.

    The short raw sample is run through filter, rename, category mapping and
    missing-value filling before ``_set_data_types`` is applied.
    """
    enusc_filtered = _filter_enusc(enusc_raw_short, relevant_var)
    enusc_renamed = _rename_enusc(enusc_filtered, rename_mapping)
    enusc_category = _map_categories(enusc_renamed)
    enusc_filled = _fill_missing(enusc_category)
    enusc_dtypes = _set_data_types(enusc_filled)

    actual_dtype = enusc_dtypes[column].dtype

    # A categorical expectation is checked by type only, so the concrete
    # categories of the column do not have to match the bare placeholder.
    if isinstance(expected_dtype, pd.CategoricalDtype):
        mismatch = not isinstance(actual_dtype, pd.CategoricalDtype)
    else:
        mismatch = actual_dtype != expected_dtype

    if mismatch:
        pytest.fail(
            f"Column '{column}' failed: expected type {expected_dtype}, "
            f"but got {actual_dtype}"
        )
188+
189+
190+
@pytest.mark.xfail
@pytest.mark.parametrize(
    ("column", "wrong_expected_dtype"),
    [(value, pd.CategoricalDtype()) for value in floats]
    + [(var, pd.Float64Dtype()) for var in integers]
    + [(val, pd.Float64Dtype()) for val in categories]
    + [(ent, "thing") for ent in strings],
)
def test_fail_set_data_types_for_check_testing(
    enusc_raw_short, column, wrong_expected_dtype
):
    """Expected failure: pair each column with a deliberately wrong dtype.

    Runs the same dtype comparison as a regular dtype test, but the
    parametrization supplies a dtype the column should not have, so the
    comparison is meant to end in ``pytest.fail``.
    """
    enusc_filtered = _filter_enusc(enusc_raw_short, relevant_var)
    enusc_renamed = _rename_enusc(enusc_filtered, rename_mapping)
    enusc_category = _map_categories(enusc_renamed)
    enusc_filled = _fill_missing(enusc_category)
    enusc_dtypes = _set_data_types(enusc_filled)

    actual_dtype = enusc_dtypes[column].dtype

    # A categorical expectation is checked by type only; everything else is
    # compared with the dtype's own inequality.
    if isinstance(wrong_expected_dtype, pd.CategoricalDtype):
        mismatch = not isinstance(actual_dtype, pd.CategoricalDtype)
    else:
        mismatch = actual_dtype != wrong_expected_dtype

    if mismatch:
        pytest.fail(
            f"Column '{column}' failed: expected type {wrong_expected_dtype}, "
            f"but got {actual_dtype}"
        )

0 commit comments

Comments
 (0)