1- import numpy as np
1+ import pandas as pd
22import pytest
33
4+ from project_mbb .config import BLD
45from project_mbb .data_management .clean_labels import (
5- create_labels_var_relation ,
66 _concat_labels_var ,
77 _filter_labels_var ,
88 _rename_variables_eng ,
99)
1010
11- def assert_categorical_equal (left , right ):
12- assert_series_equal (pd .Series (left ), pd .Series (right ))
1311
14- def test_concat_labels_var (data ):
15- data_test = _concat_labels_var (data )
@pytest.fixture
def enusc_raw():
    """Return the contents of ``BLD/data/ENUSC_raw.csv`` as a DataFrame."""
    return pd.read_csv(BLD / "data" / "ENUSC_raw.csv")
1617
17- column = ["variable_name" , "label" ]
18- variable_name = {"rph_ID, " Kish ", " Conglomerado ", }
1918
20- data_test [ "rph_ID" ] = "Identificador de persona"
21- data_test [ "Kish" ] = "Informante Kish"
22- data_test [ "Conglomerado" ] = "Pseudoconglomerado "
23- data_test [ "HUR" ] = "Hurto consumado"
24-
@pytest.fixture
def labels_raw():
    """Return the contents of ``BLD/data/column_labels.csv`` as a DataFrame."""
    # The file name must not carry trailing whitespace: on POSIX systems
    # "column_labels.csv " is a different (non-existent) file.
    labels_raw_path = BLD / "data" / "column_labels.csv"
    return pd.read_csv(labels_raw_path)
2524
26-
25+
@pytest.fixture
def data_test():
    """Return a small five-row frame of variable names and their labels."""
    rows = [
        ("rph_id", "Identificador de persona"),
        ("idhogar", "Identificador de hogar"),
        ("enc_idr", "Folio de la vivienda"),
        ("enc_region", "Región"),
        ("enc_rpc", "Región, provincia, comuna"),
    ]
    return pd.DataFrame(rows, columns=["variable_name", "label"])
39+
40+
def test_concat_labels_var(enusc_raw, labels_raw):
    """Check the columns and three known rows of ``_concat_labels_var``'s output.

    The result must expose ``variable_name`` and ``label`` columns, and the
    rows at positions 0, 1247 and 1300 must hold the expected name/label pairs.
    """
    combined = _concat_labels_var(enusc_raw, labels_raw)

    assert (
        "variable_name" in combined.columns
    ), "Column 'variable_name' not found in DataFrame"
    assert "label" in combined.columns, "Column 'label' not found in DataFrame"

    # Positions refer to the original (unfiltered) ordering, so this test stays
    # valid if more variables are later included in the filtering step.
    expected_rows = {
        0: ("rph_ID", "Identificador de persona"),
        1247: ("HUR", "Hurto consumado"),
        1300: ("Conglomerado", "Pseudoconglomerado"),
    }

    # One assertion per cell so a failure names the offending row and column
    # instead of a bare ``assert all([...])`` with no detail.
    for position, (variable_name, label) in expected_rows.items():
        found_name = combined["variable_name"].iloc[position]
        found_label = combined["label"].iloc[position]
        assert (
            found_name == variable_name
        ), f"Row {position}: expected variable_name {variable_name!r}, got {found_name!r}"
        assert (
            found_label == label
        ), f"Row {position}: expected label {label!r}, got {found_label!r}"
61+
62+
def test_filter_labels_var(data_test):
    """Check that filtering by two variable names yields exactly those two rows.

    Both frames get a fresh positional index before comparison, so only the
    row contents and column order are compared.
    """
    keep = ["rph_id", "enc_region"]
    wanted = pd.DataFrame(
        {
            "variable_name": ["rph_id", "enc_region"],
            "label": ["Identificador de persona", "Región"],
        }
    )

    filtered = _filter_labels_var(data_test, keep).reset_index(drop=True)
    wanted = wanted.reset_index(drop=True)

    pd.testing.assert_frame_equal(filtered, wanted, check_like=False)
82+
83+
def test_rename_variable_eng(data_test):
    """Check that ``_rename_variables_eng`` maps every variable name to English.

    The labels are expected to stay untouched; row/column order is ignored in
    the comparison (``check_like=True``).
    """
    rename_mapping = {
        "rph_id": "person_id",
        "idhogar": "household_id",
        "enc_idr": "housing_folio",
        "enc_region": "region",
        "enc_rpc": "commune",
    }

    renamed = _rename_variables_eng(data_test, rename_mapping)

    # The mapping is written in the same order as the fixture rows, so its
    # values are exactly the expected ``variable_name`` column.
    wanted = pd.DataFrame(
        {
            "variable_name": list(rename_mapping.values()),
            "label": [
                "Identificador de persona",
                "Identificador de hogar",
                "Folio de la vivienda",
                "Región",
                "Región, provincia, comuna",
            ],
        }
    )

    pd.testing.assert_frame_equal(renamed, wanted, check_like=True)
115+
116+
def test_task_clean_labels_shape():
    """Check that the pickled clean-labels table has 66 rows and 2 columns."""
    labels_clean = pd.read_pickle(BLD / "data" / "variable_labels_clean.pkl")
    assert (66, 2) == labels_clean.shape
0 commit comments