Skip to content

Commit 0d82523

Browse files
committed
Modified Data-Processing to write release into filenames
1 parent ec21c7b commit 0d82523

File tree

1 file changed

+12
-16
lines changed

1 file changed

+12
-16
lines changed

Data-Processing.ipynb

+12-16
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
"import pickle\n",
4040
"from clean import *\n",
4141
"from reed import regex_select\n",
42+
"from hilda_config import release_suffix\n",
4243
"\n",
4344
"pd.options.display.max_columns=100\n",
4445
"pd.options.display.max_colwidth=200\n",
@@ -80,7 +81,7 @@
8081
"correlation_threshold = 0.90\n",
8182
"redundant_threshold=0.9\n",
8283
"test = False\n",
83-
"release = \"general\" # \"restricted\""
84+
"release = \"restricted\" # \"general\""
8485
]
8586
},
8687
{
@@ -191,16 +192,11 @@
191192
"\n",
192193
"\n",
193194
"# read the combined file for the starting wave\n",
194-
"release_suffix = {\n",
195-
" \"general\": \"c\",\n",
196-
" \"restricted\": \"u\"\n",
197-
"}\n",
198-
"\n",
199-
"sfx = config.release_suffix[release]\n",
195+
"sfx = release_suffix[release]\n",
200196
"df1, meta1 = pyreadstat.read_sav(f'data/part1/Combined {s}190{sfx}.sav')\n",
201197
"n0 = len(df1)\n",
202198
"print(f\"Number of people in initial wave {n0}\")\n",
203-
"with open('data/metadata.pkl','wb') as f:\n",
199+
"with open(f'data/metadata_{release}.pkl','wb') as f:\n",
204200
" pickle.dump(meta1,f)\n",
205201
" \n",
206202
"df1 = filter_participants(df1,min_start_age, already_studying_cols)"
@@ -687,7 +683,7 @@
687683
"\n",
688684
"if not test:\n",
689685
" X, columns_dropped, r_vals = filter_raw_data(df1.copy(), missing_threshold=missing_threshold,correlation_threshold=correlation_threshold)\n",
690-
" write_data(X, treatment_outcomes, \"all_vars.csv\")"
686+
" write_data(X, treatment_outcomes, f\"all_vars_{release}.csv\")"
691687
]
692688
},
693689
{
@@ -834,7 +830,7 @@
834830
" print(\"\\n\")\n",
835831
" return f_selected\n",
836832
"\n",
837-
"def write_selected_featureset(X, columns_dropped, features, tag):\n",
833+
"def write_selected_featureset(X, columns_dropped, features, tag, release):\n",
838834
" selection = ['xwaveid']\n",
839835
" missing = []\n",
840836
" for f in features:\n",
@@ -844,7 +840,7 @@
844840
" reason = columns_dropped.get(f,\"unkown\")\n",
845841
" print(f\"Column {f} not present in X, reason:{reason}\")\n",
846842
" \n",
847-
" write_data(X[selection], treatment_outcomes, f\"all_lasso_selected_{tag}.csv\")"
843+
" write_data(X[selection], treatment_outcomes, f\"all_lasso_selected_{tag}_{release}.csv\")"
848844
]
849845
},
850846
{
@@ -862,10 +858,10 @@
862858
"\n",
863859
" X, columns_dropped, r_vals = filter_raw_data(df1.copy(), missing_threshold=0.99,correlation_threshold=1)\n",
864860
"\n",
865-
" write_selected_featureset(X, columns_dropped, f10, 10)\n",
866-
" write_selected_featureset(X, columns_dropped, f20, 20)\n",
867-
" write_selected_featureset(X, columns_dropped, f50, 50)\n",
868-
" write_selected_featureset(X, columns_dropped, f100, 100)\n",
861+
" write_selected_featureset(X, columns_dropped, f10, 10, release)\n",
862+
" write_selected_featureset(X, columns_dropped, f20, 20, release)\n",
863+
" write_selected_featureset(X, columns_dropped, f50, 50, release)\n",
864+
" write_selected_featureset(X, columns_dropped, f100, 100, release)\n",
869865
" for f in f20:\n",
870866
" print(f, meta1.column_names_to_labels.get(f))"
871867
]
@@ -900,7 +896,7 @@
900896
"# X_su = X[selected_cols]\n",
901897
"# print(\"number of features selected:\",len(selected_cols))\n",
902898
"# print(\"effective rank:\",effective_rank(Xs[selected]))\n",
903-
"# write_data(X_su.reset_index(), treatment_outcomes, \"all_unsupervised_selected.csv\")"
899+
"# write_data(X_su.reset_index(), treatment_outcomes, f\"all_unsupervised_selected_{release}.csv\")"
904900
]
905901
}
906902
],

0 commit comments

Comments (0)