Skip to content

Commit cdf0dab

Browse files
Change pytask functions from pickle to CSV due to space issues.
1 parent 194866e commit cdf0dab

File tree

2 files changed

+18
-15
lines changed

2 files changed

+18
-15
lines changed

src/project_mbb/data_management/filter_clean.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99

1010
def filter_relevant_variables(raw):
11-
enusc = pd.read_pickle(raw)
11+
enusc = pd.read_csv(raw)
1212
enusc_filtered = enusc[REL_VAR]
1313
return enusc_filtered
1414

src/project_mbb/data_management/task_data_management.py

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import pickle
21
from pathlib import Path
32

43
import pandas as pd
@@ -16,9 +15,9 @@
1615

1716

1817
products = {
19-
"column_labels": BLD / "data" / "column_labels.pkl",
20-
"value_labels": BLD / "data" / "value_labels.pkl",
21-
"enucs_raw": BLD / "data" / "ENUSC_raw.pkl",
18+
"column_labels": BLD / "data" / "column_labels.csv",
19+
"value_labels": BLD / "data" / "value_labels.csv",
20+
"enusc_raw": BLD / "data" / "ENUSC_raw.csv",
2221
}
2322

2423
products_filter = {
@@ -37,32 +36,36 @@ def task_create_data_meta(
3736

3837
df, meta = pyreadstat.read_sav(sav_data)
3938
df_pd = pd.DataFrame(df)
40-
df_pd.to_pickle(products["ENUSC_raw"])
39+
df_pd.to_csv(products["enusc_raw"])
4140

42-
with Path.open(products["value_labels"], "wb") as file:
43-
pickle.dump(obj=meta.value_labels, file=file)
41+
with Path.open(products["value_labels"], "w") as file:
42+
# Convert value_labels (dictionary) into a DataFrame for saving as CSV
43+
value_labels_df = pd.DataFrame(meta.value_labels)
44+
value_labels_df.to_csv(file, index=False)
4445

45-
with Path.open(products["column_labels"], "wb") as file:
46+
# Save column_labels as a CSV
47+
with Path.open(products["column_labels"], "w") as file:
48+
# Convert column_labels to DataFrame directly
4649
column_labels_df = pd.DataFrame(meta.column_labels)
47-
pickle.dump(obj=column_labels_df, file=file)
50+
column_labels_df.to_csv(file, index=False)
4851

4952

5053
def task_create_labels_var_relation(
51-
raw_enusc=BLD / "data" / "ENUSC_raw.pkl",
52-
column_labels=BLD / "column_labels.pkl",
54+
raw_enusc=BLD / "data" / "ENUSC_raw.csv",
55+
column_labels=BLD / "data" / "column_labels.csv",
5356
produces=BLD / "data" / "variable_labels.pkl",
5457
):
55-
raw = pd.read_pickle(raw_enusc)
58+
raw = pd.read_csv(raw_enusc)
5659
column_names = raw.columns.tolist()
5760
column_names = pd.DataFrame(column_names)
58-
column_labels = pd.read_pickle(column_labels)
61+
column_labels = pd.read_csv(column_labels)
5962
labels_var = pd.concat([column_names, column_labels], axis=1)
6063
labels_var.columns = ["variable_name", "label"]
6164
labels_var.to_pickle(produces)
6265

6366

6467
def task_filter_variables_and_labels(
65-
raw=BLD / "data" / "ENUSC_raw.pkl",
68+
raw=BLD / "data" / "ENUSC_raw.csv",
6669
labels_raw=BLD / "data" / "variable_labels.pkl",
6770
produces=products_filter,
6871
):

0 commit comments

Comments
 (0)