33
44from project_mbb .config import BLD
55from project_mbb .data_management .clean_enusc import (
6+ _fill_missing ,
67 _filter_enusc ,
8+ _map_categories ,
79 _rename_enusc ,
10+ _set_data_types ,
11+ )
12+ from project_mbb .parameters import (
13+ categories ,
14+ floats ,
15+ integers ,
16+ relevant_var ,
17+ rename_mapping ,
18+ security_levels_mapping ,
19+ ses_mapping ,
20+ strings ,
821)
9- from project_mbb .parameters import relevant_var , rename_mapping
1022
1123
1224@pytest .fixture
@@ -17,21 +29,13 @@ def enusc_raw():
1729
1830
1931@pytest .fixture
20- def data_test ():
21- data = {
22- "variable_name" : ["rph_id" , "idhogar" , "enc_idr" , "enc_region" , "enc_rpc" ],
23- "label" : [
24- "Identificador de persona" ,
25- "Identificador de hogar" ,
26- "Folio de la vivienda" ,
27- "Región" ,
28- "Región, provincia, comuna" ,
29- ],
30- }
31- return pd .DataFrame (data )
32-
33-
34- def test_check_if_filter_correct_columns (enusc_raw ):
def enusc_raw_short():
    """Read the first three rows of ``ENUSC_raw.csv`` found under ``BLD / "data"``.

    Returns:
        pandas.DataFrame: The three-row sample produced by ``pd.read_csv``.
    """
    # Only a handful of rows are needed, so the read is capped with ``nrows``.
    return pd.read_csv(BLD / "data" / "ENUSC_raw.csv", nrows=3)
36+
37+
38+ def test_filter_correct_columns (enusc_raw ):
3539 enusc_filtered = _filter_enusc (enusc_raw , relevant_var )
3640
3741 actual_columns = set (enusc_filtered .columns )
@@ -58,3 +62,160 @@ def test_rename_enusc(enusc_raw):
5862 assert expected_columns .issubset (
5963 actual_columns
6064 ), f"Missing columns: { missing_columns } , Extra: { extra_columns } "
65+
66+
@pytest.mark.parametrize(
    ("column", "expected_value"),
    [
        ("socioecon_status", "Low SES"),
        ("insec_driving", "Secure"),
        ("trust_pdi", "Quite a bit of trust"),
        ("crime_info_source_nation", "News programs on television"),
    ],
)
def test_value_presence(enusc_raw_short, column, expected_value):
    """Check that ``expected_value`` occurs in ``column`` after category mapping.

    The sample is filtered, renamed and passed through ``_map_categories``
    before the membership check is made.
    """
    mapped = _map_categories(
        _rename_enusc(_filter_enusc(enusc_raw_short, relevant_var), rename_mapping)
    )
    observed = mapped[column].values

    assert expected_value in observed, f" {expected_value} not found in {column} "
84+
85+
@pytest.mark.parametrize(
    "column",
    ["socioecon_status", "insec_driving", "trust_pdi", "crime_info_source_nation"],
)
def test_column_is_categorical(enusc_raw_short, column):
    """Check that ``column`` has a categorical dtype after ``_map_categories``."""
    mapped = _map_categories(
        _rename_enusc(_filter_enusc(enusc_raw_short, relevant_var), rename_mapping)
    )
    column_dtype = mapped[column].dtype

    assert isinstance(column_dtype, pd.CategoricalDtype), f"{column} is not categorical"
98+
99+
@pytest.mark.parametrize(
    ("column", "values_dict"),
    [
        ("socioecon_status", ses_mapping),
        ("insec_driving", security_levels_mapping),
    ],
)
def test_value_presence_continued(enusc_raw_short, column, values_dict):
    """Check that every non-missing entry of ``column`` is a value of ``values_dict``.

    The sample is filtered, renamed and passed through ``_map_categories``.
    Every offending entry is collected so that a single failure reports all of
    them at once.
    """
    enusc_category = _map_categories(
        _rename_enusc(_filter_enusc(enusc_raw_short, relevant_var), rename_mapping)
    )

    # Build the lookup once instead of rebuilding ``values()`` for every row.
    allowed = set(values_dict.values())

    errors = [
        f"Unexpected value {value} in column '{column}'"
        for value in enusc_category[column]
        # Missing values are skipped; only real entries are validated.
        if not pd.isna(value) and value not in allowed
    ]

    if errors:
        pytest.fail("\n ".join(errors))
121+
122+
@pytest.mark.xfail(
    strict=True,
    reason="'age' is expected not to be categorical after _map_categories",
)
def test_fail_age_not_categorical(enusc_raw_short):
    """Expected failure: ``age`` should not have a categorical dtype.

    The marker is strict so that an unexpected pass (``age`` turning
    categorical) fails the suite instead of being reported as XPASS.
    """
    enusc_category = _map_categories(
        _rename_enusc(_filter_enusc(enusc_raw_short, relevant_var), rename_mapping)
    )
    assert isinstance(enusc_category["age"].dtype, pd.CategoricalDtype)
129+
130+
def test_amount_of_missing(enusc_raw_short):
    """Check the missing-value count of each renamed column after ``_fill_missing``.

    For every raw column that has an entry in ``rename_mapping`` and whose
    renamed counterpart exists in the filled frame, the number of missing
    entries in the filled column must equal the number of raw entries that are
    either NaN or equal to the sentinel ``99``. All mismatches are collected
    and reported together.
    """
    enusc_filled = _fill_missing(
        _map_categories(
            _rename_enusc(_filter_enusc(enusc_raw_short, relevant_var), rename_mapping)
        )
    )

    missing_value = 99
    errors = []

    for column in enusc_raw_short.columns:
        # Columns without a rename entry are not tracked by this check.
        if column not in rename_mapping:
            continue
        filled_column = rename_mapping[column]
        if filled_column not in enusc_filled.columns:
            continue

        raw_values = enusc_raw_short[column]
        expected_missing = raw_values.isna().sum() + (raw_values == missing_value).sum()
        actual_missing = enusc_filled[filled_column].isna().sum()

        if actual_missing != expected_missing:
            errors.append(
                f"Column '{column}' (mapped to '{filled_column}') failed: "
                f"expected missing value count of {expected_missing}, "
                f"but got {actual_missing}"
            )

    if errors:
        pytest.fail("\n ".join(errors))
157+
158+
@pytest.mark.parametrize(
    ("column", "expected_dtype"),
    [(value, pd.Float64Dtype()) for value in floats]
    + [(var, pd.Int8Dtype()) for var in integers]
    + [(val, pd.CategoricalDtype()) for val in categories]
    + [(ent, "object") for ent in strings],
)
def test_set_data_types(enusc_raw_short, column, expected_dtype):
    """Check that ``_set_data_types`` gives ``column`` the expected dtype.

    The sample runs through the full chain (filter, rename, map categories,
    fill missing, set data types) before the dtype of ``column`` is inspected.
    """
    enusc_dtypes = _set_data_types(
        _fill_missing(
            _map_categories(
                _rename_enusc(
                    _filter_enusc(enusc_raw_short, relevant_var), rename_mapping
                )
            )
        )
    )
    actual_dtype = enusc_dtypes[column].dtype

    if isinstance(expected_dtype, pd.CategoricalDtype):
        # The expected CategoricalDtype is built without categories, so only
        # the kind of dtype is checked here, not its category set.
        mismatch = not isinstance(actual_dtype, pd.CategoricalDtype)
    else:
        mismatch = actual_dtype != expected_dtype

    if mismatch:
        pytest.fail(
            f"Column '{column}' failed: expected type {expected_dtype}, "
            f"but got {actual_dtype}"
        )
188+
189+
@pytest.mark.xfail(
    strict=True,
    reason="each column is paired with a deliberately wrong dtype",
)
@pytest.mark.parametrize(
    ("column", "wrong_expected_dtype"),
    [(value, pd.CategoricalDtype()) for value in floats]
    + [(var, pd.Float64Dtype()) for var in integers]
    + [(val, pd.Float64Dtype()) for val in categories]
    + [(ent, "thing") for ent in strings],
)
def test_fail_set_data_types_for_check_testing(
    enusc_raw_short, column, wrong_expected_dtype
):
    """Expected failure: the dtype check must reject a wrong expected dtype.

    Runs the same dtype comparison as the regular dtype test, but with
    parameters that pair every column with a dtype it should not have. The
    marker is strict so that a case that unexpectedly passes fails the suite
    instead of being reported as XPASS.
    """
    enusc_dtypes = _set_data_types(
        _fill_missing(
            _map_categories(
                _rename_enusc(
                    _filter_enusc(enusc_raw_short, relevant_var), rename_mapping
                )
            )
        )
    )
    actual_dtype = enusc_dtypes[column].dtype

    if isinstance(wrong_expected_dtype, pd.CategoricalDtype):
        # The CategoricalDtype parameter carries no categories, so only the
        # kind of dtype is checked here.
        mismatch = not isinstance(actual_dtype, pd.CategoricalDtype)
    else:
        mismatch = actual_dtype != wrong_expected_dtype

    if mismatch:
        pytest.fail(
            f"Column '{column}' failed: expected type {wrong_expected_dtype}, "
            f"but got {actual_dtype}"
        )
0 commit comments