55import pyreadstat
66
77from project_mbb .config import BLD , DATA
8+ from project_mbb .data_management .filter_clean import (
9+ filter_labels_var ,
10+ filter_relevant_variables ,
11+ )
812
913pd .options .mode .copy_on_write = True
1014pd .options .future .infer_string = True
# Pickle files written by the metadata task, keyed by product name.
# NOTE(review): "enucs_raw" looks like a misspelling of "enusc_raw"; the key is
# kept as-is because code outside this view may look it up by this spelling.
products = {
    name: BLD / "data" / filename
    for name, filename in (
        ("column_labels", "column_labels.pkl"),
        ("value_labels", "value_labels.pkl"),
        ("enucs_raw", "ENUSC_raw.pkl"),
    )
}

# Pickle files written by the filtering task, keyed by product name.
products_filter = {
    name: BLD / "data" / filename
    for name, filename in (
        ("enusc_filt", "enusc_filtered.pkl"),
        ("labels_filt", "variable_labels_filtered.pkl"),
    )
}
1928
2029
21- def task_create_data_info (
30+ def task_create_data_meta (
2231 sav_data = DATA / "base-usuario-20-enusc-2023.sav" , produces = products
2332):
2433 output_dir = BLD / "data"
@@ -36,3 +45,28 @@ def task_create_data_info(
3645 with Path .open (products ["column_labels" ], "wb" ) as file :
3746 column_labels_df = pd .DataFrame (meta .column_labels )
3847 pickle .dump (obj = column_labels_df , file = file )
48+
49+
def task_create_labels_var_relation(
    raw_enusc=BLD / "data" / "ENUSC_raw.pkl",
    column_labels=BLD / "data" / "column_labels.pkl",
    produces=BLD / "data" / "variable_labels.pkl",
):
    """Pair every column name of the raw data set with its column label.

    Args:
        raw_enusc: Path to the pickled raw DataFrame; only its column names
            are used.
        column_labels: Path to the pickled DataFrame of column labels. The
            default points into ``BLD / "data"``, the directory the metadata
            task writes ``column_labels.pkl`` to.
        produces: Path the resulting two-column DataFrame
            (``variable_name``, ``label``) is pickled to.
    """
    raw = pd.read_pickle(raw_enusc)
    names = pd.DataFrame(raw.columns.tolist())
    # Load into a separate local so the path argument is not shadowed.
    labels = pd.read_pickle(column_labels)
    # Side-by-side concat aligns on the row index.
    # NOTE(review): assumes labels are stored in the same order as the raw
    # columns, one per row - confirm against the metadata task.
    labels_var = pd.concat([names, labels], axis=1)
    labels_var.columns = ["variable_name", "label"]
    labels_var.to_pickle(produces)
62+
63+
def task_filter_variables_and_labels(
    raw=BLD / "data" / "ENUSC_raw.pkl",
    labels_raw=BLD / "data" / "variable_labels.pkl",
    produces=products_filter,
):
    """Filter the raw data and its variable labels and pickle both results.

    Args:
        raw: Location of the raw data, forwarded unchanged to
            ``filter_relevant_variables``.
        labels_raw: Location of the variable/label table, forwarded unchanged
            to ``filter_labels_var``.
        produces: Mapping with the keys ``"enusc_filt"`` and ``"labels_filt"``
            giving the output paths. It is only read, never mutated.

    NOTE(review): both helpers receive paths, not loaded DataFrames - confirm
    that they load the pickles themselves.
    """
    enusc_filtered = filter_relevant_variables(raw)
    # Write to the paths passed in via ``produces`` rather than the module-level
    # dict, so overriding the argument actually redirects the output.
    enusc_filtered.to_pickle(produces["enusc_filt"])
    var_labels = filter_labels_var(labels_raw)
    var_labels.to_pickle(produces["labels_filt"])
0 commit comments