Skip to content

Commit 9c461f5

Browse files
Update of var-label relation functions for pytask.
1 parent cdf0dab commit 9c461f5

File tree

11 files changed

+17113
-404
lines changed

11 files changed

+17113
-404
lines changed

.DS_Store

0 Bytes
Binary file not shown.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,8 @@ extend-ignore = [
9393
"RET504", # Don't force to calculate upon return
9494
"S101", # Use of `assert` detected.
9595
"S301", # pickle module is unsafe
96-
"ARG001", # Unused function
96+
"ARG001", # Unused function argument MB
97+
"ERA001", # Commented-out code MB
9798
]
9899

99100
[tool.ruff.lint.per-file-ignores]

src/.DS_Store

0 Bytes
Binary file not shown.

src/project_mbb/.DS_Store

0 Bytes
Binary file not shown.

src/project_mbb/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
DOCUMENTS = ROOT.joinpath("documents").resolve()
1717

1818
REL_VAR = [
19-
"enc_rpc",
19+
"0" "enc_rpc",
2020
"rph_ID",
2121
"idhogar",
2222
"enc_idr",

src/project_mbb/data/.DS_Store

0 Bytes
Binary file not shown.

src/project_mbb/data_management/filter_clean.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,40 @@
77
pd.options.plotting.backend = "plotly"
88

99

10-
def filter_relevant_variables(raw):
11-
enusc = pd.read_csv(raw)
12-
enusc_filtered = enusc[REL_VAR]
13-
return enusc_filtered
10+
def create_labels_var_relation(raw_enusc, labels_raw):
11+
labels_var = _concat_labels_var(raw_enusc, labels_raw)
12+
labels_var_filt = _filter_labels_var(labels_var)
13+
return labels_var_filt
1414

1515

16-
def filter_labels_var(labels_raw):
17-
var_labels = pd.read_pickle(labels_raw)
18-
var_labels = var_labels[var_labels["variable_name"].isin(REL_VAR)]
19-
return var_labels
16+
def _concat_labels_var(raw_enusc, labels_raw):
17+
raw = pd.read_csv(raw_enusc)
18+
column_names = raw.columns.tolist()
19+
column_names = pd.DataFrame(column_names)
20+
column_names = column_names.iloc[1:]
21+
column_labels = pd.read_csv(labels_raw)
22+
column_labels.index = column_names.index
23+
labels_var = pd.concat([column_names, column_labels], axis=1)
24+
labels_var.columns = ["variable_name", "label"]
25+
# labels_var = pd.DataFrame(labels_var)
26+
return labels_var
27+
28+
29+
def _filter_labels_var(var_labels):
30+
var_labels_filt = var_labels[var_labels["variable_name"].isin(REL_VAR)]
31+
return var_labels_filt
32+
33+
34+
# def _no_ready_clean_enusc(
35+
# raw=BLD / "data" / "ENUSC_raw.csv",
36+
# labels_raw=BLD / "data" / "variable_labels.pkl",
37+
# produces=products_filter,
38+
# ):
39+
# enusc_filtered = _filter_relevant_variables(raw)
40+
# enusc_filtered.to_pickle(products_filter["enusc_filt"])
41+
42+
43+
# def _no_ready_filter_relevant_variables(raw):
44+
# enusc = pd.read_csv(raw)
45+
# enusc_filtered = enusc[REL_VAR]
46+
# return enusc_filtered

src/project_mbb/data_management/task_data_management.py

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,7 @@
44
import pyreadstat
55

66
from project_mbb.config import BLD, DATA
7-
from project_mbb.data_management.filter_clean import (
8-
filter_labels_var,
9-
filter_relevant_variables,
10-
)
7+
from project_mbb.data_management.filter_clean import create_labels_var_relation
118

129
pd.options.mode.copy_on_write = True
1310
pd.options.future.infer_string = True
@@ -27,14 +24,9 @@
2724

2825

2926
def task_create_data_meta(
30-
sav_data=DATA / "base-usuario-20-enusc-2023.sav", produces=products
27+
raw_data=DATA / "base-usuario-20-enusc-2023.sav", produces=products
3128
):
32-
output_dir = BLD / "data"
33-
34-
if not output_dir.exists():
35-
output_dir.mkdir()
36-
37-
df, meta = pyreadstat.read_sav(sav_data)
29+
df, meta = pyreadstat.read_sav(raw_data)
3830
df_pd = pd.DataFrame(df)
3931
df_pd.to_csv(products["enusc_raw"])
4032

@@ -50,26 +42,25 @@ def task_create_data_meta(
5042
column_labels_df.to_csv(file, index=False)
5143

5244

53-
def task_create_labels_var_relation(
54-
raw_enusc=BLD / "data" / "ENUSC_raw.csv",
55-
column_labels=BLD / "data" / "column_labels.csv",
45+
def task_clean_labels(
46+
raw=BLD / "data" / "ENUSC_raw.csv",
47+
labels_raw=BLD / "data" / "column_labels.csv",
5648
produces=BLD / "data" / "variable_labels.pkl",
5749
):
58-
raw = pd.read_csv(raw_enusc)
59-
column_names = raw.columns.tolist()
60-
column_names = pd.DataFrame(column_names)
61-
column_labels = pd.read_csv(column_labels)
62-
labels_var = pd.concat([column_names, column_labels], axis=1)
63-
labels_var.columns = ["variable_name", "label"]
50+
labels_var = create_labels_var_relation(raw, labels_raw)
6451
labels_var.to_pickle(produces)
6552

6653

67-
def task_filter_variables_and_labels(
68-
raw=BLD / "data" / "ENUSC_raw.csv",
69-
labels_raw=BLD / "data" / "variable_labels.pkl",
70-
produces=products_filter,
71-
):
72-
enusc_filtered = filter_relevant_variables(raw)
73-
enusc_filtered.to_pickle(products_filter["enusc_filt"])
74-
var_labels = filter_labels_var(labels_raw)
75-
var_labels.to_pickle(products_filter["labels_filt"])
54+
# def no_ready_task_clean_data(raw=BLD / "data" / "ENUSC_raw.csv",
55+
# labels_raw=BLD / "data" / "variable_labels.pkl",
56+
# produces=products_filter):
57+
58+
# def no_ready_filter_variables_and_labels(
59+
# raw=BLD / "data" / "ENUSC_raw.csv",
60+
# labels_raw=BLD / "data" / "variable_labels.pkl",
61+
# produces=products_filter,
62+
# ):
63+
# enusc_filtered = filter_relevant_variables(raw)
64+
# enusc_filtered.to_pickle(products_filter["enusc_filt"])
65+
# var_labels = filter_labels_var(labels_raw)
66+
# var_labels.to_pickle(products_filter["labels_filt"])

tests/possible_tests.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Test the function that merges labels and variables to check that there is a label for each person, and spot-check some label–variable relations to confirm they are correct in different parts of the code.

0 commit comments

Comments
 (0)