44import pyreadstat
55
66from project_mbb .config import BLD , DATA
7- from project_mbb .data_management .filter_clean import (
8- filter_labels_var ,
9- filter_relevant_variables ,
10- )
7+ from project_mbb .data_management .filter_clean import create_labels_var_relation
118
129pd .options .mode .copy_on_write = True
1310pd .options .future .infer_string = True
2724
2825
2926def task_create_data_meta (
30- sav_data = DATA / "base-usuario-20-enusc-2023.sav" , produces = products
27+ raw_data = DATA / "base-usuario-20-enusc-2023.sav" , produces = products
3128):
32- output_dir = BLD / "data"
33-
34- if not output_dir .exists ():
35- output_dir .mkdir ()
36-
37- df , meta = pyreadstat .read_sav (sav_data )
29+ df , meta = pyreadstat .read_sav (raw_data )
3830 df_pd = pd .DataFrame (df )
3931 df_pd .to_csv (products ["enusc_raw" ])
4032
@@ -50,26 +42,25 @@ def task_create_data_meta(
5042 column_labels_df .to_csv (file , index = False )
5143
5244
53- def task_create_labels_var_relation (
54- raw_enusc = BLD / "data" / "ENUSC_raw.csv" ,
55- column_labels = BLD / "data" / "column_labels.csv" ,
45+ def task_clean_labels (
46+ raw = BLD / "data" / "ENUSC_raw.csv" ,
47+ labels_raw = BLD / "data" / "column_labels.csv" ,
5648 produces = BLD / "data" / "variable_labels.pkl" ,
5749):
58- raw = pd .read_csv (raw_enusc )
59- column_names = raw .columns .tolist ()
60- column_names = pd .DataFrame (column_names )
61- column_labels = pd .read_csv (column_labels )
62- labels_var = pd .concat ([column_names , column_labels ], axis = 1 )
63- labels_var .columns = ["variable_name" , "label" ]
50+ labels_var = create_labels_var_relation (raw , labels_raw )
6451 labels_var .to_pickle (produces )
6552
6653
67- def task_filter_variables_and_labels (
68- raw = BLD / "data" / "ENUSC_raw.csv" ,
69- labels_raw = BLD / "data" / "variable_labels.pkl" ,
70- produces = products_filter ,
71- ):
72- enusc_filtered = filter_relevant_variables (raw )
73- enusc_filtered .to_pickle (products_filter ["enusc_filt" ])
74- var_labels = filter_labels_var (labels_raw )
75- var_labels .to_pickle (products_filter ["labels_filt" ])
54+ # def no_ready_task_clean_data(raw=BLD / "data" / "ENUSC_raw.csv",
55+ # labels_raw=BLD / "data" / "variable_labels.pkl",
56+ # produces=products_filter):
57+
58+ # def no_ready_filter_variables_and_labels(
59+ # raw=BLD / "data" / "ENUSC_raw.csv",
60+ # labels_raw=BLD / "data" / "variable_labels.pkl",
61+ # produces=products_filter,
62+ # ):
63+ # enusc_filtered = filter_relevant_variables(raw)
64+ # enusc_filtered.to_pickle(products_filter["enusc_filt"])
65+ # var_labels = filter_labels_var(labels_raw)
66+ # var_labels.to_pickle(products_filter["labels_filt"])
0 commit comments