Skip to content

Commit 194866e

Browse files
Unify Filter and Label Functions in Pytask Format.
1 parent c221826 commit 194866e

File tree

7 files changed

+20968
-237
lines changed

7 files changed

+20968
-237
lines changed

.DS_Store

0 Bytes
Binary file not shown.

src/.DS_Store

0 Bytes
Binary file not shown.

src/project_mbb/.DS_Store

0 Bytes
Binary file not shown.

src/project_mbb/config.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515

1616
DOCUMENTS = ROOT.joinpath("documents").resolve()
1717

18-
1918
REL_VAR = [
2019
"enc_rpc",
2120
"rph_ID",
@@ -93,9 +92,6 @@
9392
"HUR_MOTIV_NO_DEN",
9493
"SCREEN_ROB_HUR",
9594
"HUR_DENUNCIAS",
96-
]
97-
98-
AUX_VAR = [
9995
"PAD",
10096
"PADC",
10197
"PADB",
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import pandas as pd
2+
3+
from project_mbb.config import REL_VAR
4+
5+
pd.options.mode.copy_on_write = True
6+
pd.options.future.infer_string = True
7+
pd.options.plotting.backend = "plotly"
8+
9+
10+
def filter_relevant_variables(raw):
    """Load a pickled DataFrame and keep only the ``REL_VAR`` columns.

    Args:
        raw: Path (or other object accepted by ``pd.read_pickle``) of the
            pickled DataFrame to load.

    Returns:
        The loaded DataFrame restricted to the columns named in ``REL_VAR``.
    """
    survey = pd.read_pickle(raw)
    return survey[REL_VAR]
14+
15+
16+
def filter_labels_var(labels_raw):
    """Load the pickled label table and keep rows for ``REL_VAR`` variables.

    Args:
        labels_raw: Path (or other object accepted by ``pd.read_pickle``) of
            a pickled DataFrame that has a ``"variable_name"`` column.

    Returns:
        The rows of the loaded DataFrame whose ``"variable_name"`` value is
        contained in ``REL_VAR``.
    """
    labels = pd.read_pickle(labels_raw)
    is_relevant = labels["variable_name"].isin(REL_VAR)
    return labels[is_relevant]

src/project_mbb/data_management/task_data_management.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
import pyreadstat
66

77
from project_mbb.config import BLD, DATA
8+
from project_mbb.data_management.filter_clean import (
9+
filter_labels_var,
10+
filter_relevant_variables,
11+
)
812

913
pd.options.mode.copy_on_write = True
1014
pd.options.future.infer_string = True
@@ -14,11 +18,16 @@
1418
# Output paths, all under BLD / "data", passed as the default `produces`
# mapping of the metadata task defined below.
# NOTE(review): the key "enucs_raw" looks like a transposition of "enusc"
# (compare "enusc_filt" in products_filter and the file name ENUSC_raw.pkl).
# Confirm every lookup of this key before renaming it.
products = {
1519
"column_labels": BLD / "data" / "column_labels.pkl",
1620
"value_labels": BLD / "data" / "value_labels.pkl",
17-
"ENUSC_raw": BLD / "data" / "ENUSC_raw.pkl",
21+
"enucs_raw": BLD / "data" / "ENUSC_raw.pkl",
22+
}
23+
24+
# Output paths, under BLD / "data", for the filtered survey data
# ("enusc_filt") and the filtered variable-label table ("labels_filt").
# Used as the default `produces` mapping of task_filter_variables_and_labels.
products_filter = {
25+
"enusc_filt": BLD / "data" / "enusc_filtered.pkl",
26+
"labels_filt": BLD / "data" / "variable_labels_filtered.pkl",
1827
}
1928

2029

21-
def task_create_data_info(
30+
def task_create_data_meta(
2231
sav_data=DATA / "base-usuario-20-enusc-2023.sav", produces=products
2332
):
2433
output_dir = BLD / "data"
@@ -36,3 +45,28 @@ def task_create_data_info(
3645
with Path.open(products["column_labels"], "wb") as file:
3746
column_labels_df = pd.DataFrame(meta.column_labels)
3847
pickle.dump(obj=column_labels_df, file=file)
48+
49+
50+
def task_create_labels_var_relation(
    raw_enusc=BLD / "data" / "ENUSC_raw.pkl",
    column_labels=BLD / "data" / "column_labels.pkl",
    produces=BLD / "data" / "variable_labels.pkl",
):
    """Build a table pairing each raw column name with its label.

    The column names of the raw data and the pickled column-label table are
    placed side by side (aligned on their index) and stored as one DataFrame
    with the columns ``"variable_name"`` and ``"label"``.

    Args:
        raw_enusc: Path of the pickled raw DataFrame whose column names are
            used.
        column_labels: Path of the pickled column-label DataFrame. The
            default points into ``BLD / "data"``, which is where this module
            writes ``column_labels.pkl``.
        produces: Path the resulting DataFrame is pickled to.

    Raises:
        ValueError: If the concatenated frame does not have exactly two
            columns, i.e. the label table is not a single-column frame.
    """
    raw = pd.read_pickle(raw_enusc)
    names_df = pd.DataFrame(raw.columns.tolist())
    # Keep the path parameter untouched; load the labels under a new name.
    labels_df = pd.read_pickle(column_labels)
    # NOTE(review): pairing is purely positional -- presumably the label
    # table is stored in the same order as the raw columns; confirm.
    labels_var = pd.concat([names_df, labels_df], axis=1)
    labels_var.columns = ["variable_name", "label"]
    labels_var.to_pickle(produces)
62+
63+
64+
def task_filter_variables_and_labels(
    raw=BLD / "data" / "ENUSC_raw.pkl",
    labels_raw=BLD / "data" / "variable_labels.pkl",
    produces=products_filter,
):
    """Write the filtered survey data and the filtered label table.

    Args:
        raw: Path of the pickled raw DataFrame passed to
            ``filter_relevant_variables``.
        labels_raw: Path of the pickled label table passed to
            ``filter_labels_var``.
        produces: Mapping with the keys ``"enusc_filt"`` and
            ``"labels_filt"`` giving the output paths. Defaults to
            ``products_filter``.
    """
    # Write through the `produces` argument rather than the module-level
    # dict, so the paths handed to the task are the ones actually used.
    filter_relevant_variables(raw).to_pickle(produces["enusc_filt"])
    filter_labels_var(labels_raw).to_pickle(produces["labels_filt"])

tests_maren.ipynb

Lines changed: 20913 additions & 231 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)